Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll | 17
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 81
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll | 82
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/cast_ldst.ll | 10
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/fparith.ll | 172
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll | 612
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/freeshift.ll | 80
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/gep.ll | 1820
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/immediates.ll | 180
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/insertelement.ll | 39
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll | 371
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll | 14
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/load_store.ll | 502
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/logicalop.ll | 176
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll | 100
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll | 81
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll | 181
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll | 24
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/select.ll | 388
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/shuffle.ll | 452
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll | 77
-rw-r--r--  llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll | 41
-rw-r--r--  llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll | 6
-rw-r--r--  llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll | 15
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll | 4
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll | 4
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll | 48
-rw-r--r--  llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll | 13
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr28880.ll | 5
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr39197.ll | 34
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr40038.ll | 11
-rw-r--r--  llvm/test/Analysis/MemorySSA/pr43569.ll | 8
-rw-r--r--  llvm/test/Analysis/ScalarEvolution/pr22674.ll | 4
-rw-r--r--  llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll | 4
-rw-r--r--  llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll | 7
-rw-r--r--  llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll | 26
-rw-r--r--  llvm/test/Assembler/atomic.ll | 19
-rw-r--r--  llvm/test/Assembler/metadata-annotations.ll | 9
-rw-r--r--  llvm/test/Bitcode/DILocation-implicit-code.ll | 31
-rw-r--r--  llvm/test/Bitcode/drop-debug-info.3.5.ll | 4
-rw-r--r--  llvm/test/Bitcode/thinlto-deadstrip-flag.ll | 20
-rw-r--r--  llvm/test/Bitcode/thinlto-index-flags.ll | 39
-rw-r--r--  llvm/test/Bitcode/upgrade-tbaa.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir | 30
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir | 19
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir | 20
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir | 102
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 489
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-smull.ll | 14
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll | 1943
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 57
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-vcvt.ll | 30
-rw-r--r--  llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll | 178
-rw-r--r--  llvm/test/CodeGen/AArch64/dup.ll | 12
-rw-r--r--  llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll | 21
-rw-r--r--  llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll | 21
-rw-r--r--  llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll | 73
-rw-r--r--  llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 80
-rw-r--r--  llvm/test/CodeGen/AArch64/pr164181.ll | 640
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll | 109
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir | 97
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 21
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 69
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 738
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir | 275
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir | 275
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/add-max.ll | 151
-rw-r--r--  llvm/test/CodeGen/AMDGPU/addsub64_carry.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 1260
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 135
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 210
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 585
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 218
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll | 144
-rw-r--r--  llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 614
-rw-r--r--  llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ctpop16.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/div_i128.ll | 32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll | 25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 35
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv.f64.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmaximum.ll | 921
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fminimum.ll | 921
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 128
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frem.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 44
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 81
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 81
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-attr.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll | 191
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 128
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 222
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minmax.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 82
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 64
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sdiv64.ll | 146
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srem.ll | 654
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srem64.ll | 207
-rw-r--r--  llvm/test/CodeGen/AMDGPU/stackguard.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uaddo.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/udiv64.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/urem64.ll | 146
-rw-r--r--  llvm/test/CodeGen/AMDGPU/usubo.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wave32.ll | 190
-rw-r--r--  llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 8
-rw-r--r--  llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir | 4
-rw-r--r--  llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 6
-rw-r--r--  llvm/test/CodeGen/ARM/O3-pipeline.ll | 1
-rw-r--r--  llvm/test/CodeGen/ARM/Windows/wineh-basic.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/byval_load_align.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/call-graph-section-assembly.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/cfguard-module-flag.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/clang-section.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir | 2
-rw-r--r--  llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/constantpool-promote.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/early-cfi-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16-vld.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/global-merge-1.ll | 6
-rw-r--r--  llvm/test/CodeGen/ARM/isel-v8i32-crash.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-arm.ll | 138
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-cbz-range.ll | 81
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll | 99
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb.ll | 215
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb2.ll | 163
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi.ll | 28
-rw-r--r--  llvm/test/CodeGen/ARM/out-of-registers.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/relax-per-target-feature.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/softfp-constant-comparison.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/struct-byval-frame-index.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/subtarget-align.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/unschedule-first-call.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/vector-spilling.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/vldm-sched-a9.ll | 2
-rw-r--r--  llvm/test/CodeGen/AVR/dynalloca.ll | 20
-rw-r--r--  llvm/test/CodeGen/AVR/issue-163015.ll | 32
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/binary-format.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll | 11
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-builtin.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-global-var.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-func.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/filename.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-non-void.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-source.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-typedef.ll | 8
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/func-void.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll | 28
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll | 24
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/local-var.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/pruning-const.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll | 59
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/ptr-named.ll | 75
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-func.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-inited.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-sec.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/static-var.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll | 8
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/weak-global-2.ll | 5
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/weak-global.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll | 9
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 22
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll | 13
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll | 24
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll | 18
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll | 16
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll | 20
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll | 26
-rw-r--r--  llvm/test/CodeGen/BPF/CORE/store-addr.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll | 19
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll | 17
-rw-r--r--  llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll | 17
-rw-r--r--  llvm/test/CodeGen/BPF/callx.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/dwarfdump.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/i128.ll | 22
-rw-r--r--  llvm/test/CodeGen/BPF/is_trunc_free.ll | 10
-rw-r--r--  llvm/test/CodeGen/BPF/is_zext_free.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/objdump_two_funcs.ll | 9
-rw-r--r--  llvm/test/CodeGen/BPF/optnone-1.ll | 7
-rw-r--r--  llvm/test/CodeGen/BPF/reloc-btf-2.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/reloc-btf.ll | 4
-rw-r--r--  llvm/test/CodeGen/BPF/simplifycfg.ll | 27
-rw-r--r--  llvm/test/CodeGen/BPF/warn-stack.ll | 30
-rw-r--r--  llvm/test/CodeGen/BPF/xadd.ll | 7
-rw-r--r--  llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll | 13
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll | 9
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll | 63
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll | 55
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll | 54
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll | 52
-rw-r--r--  llvm/test/CodeGen/Hexagon/masked_gather.ll | 58
-rw-r--r--  llvm/test/CodeGen/Hexagon/vector-gather.ll | 27
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll | 12
-rw-r--r--  llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir | 15
-rw-r--r--  llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir | 51
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json | 18
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json | 18
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json | 11
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json | 11
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt | 291
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt | 291
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/if-else.mir | 8
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir | 22
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll | 6
-rw-r--r--  llvm/test/CodeGen/MSP430/libcalls.ll | 14
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll | 320
-rw-r--r--  llvm/test/CodeGen/RISCV/rv32p.ll | 709
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64p.ll | 677
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll | 191
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll | 191
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll | 160
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll | 335
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll | 125
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll | 815
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 2336
-rw-r--r--  llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll | 9
-rw-r--r--  llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll | 6
-rw-r--r--  llvm/test/CodeGen/Thumb/PR17309.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb/fastcc.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb/ldm-merge-call.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb/stm-merge.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/t2sizereduction.mir | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/memory-interleave.ll | 5
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll | 244
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll | 246
-rw-r--r--  llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/atom-fixup-lea4.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/atomic-load-store.ll | 955
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll | 94
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/bit-piece-comment.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-assembly.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/catchpad-regmask.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/catchpad-weight.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/clang-section-coff.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/cleanuppad-inalloca.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/combine-adc.ll | 48
-rw-r--r--  llvm/test/CodeGen/X86/combine-sbb.ll | 81
-rw-r--r--  llvm/test/CodeGen/X86/complex-fastmath.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dag-optnone.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/dag-update-nodetomatch.ll | 129
-rw-r--r--  llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dbg-changes-codegen.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dbg-combine.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/debug-loclists-lto.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/debugloc-argsize.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/early-cfi-sections.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fadd-combines.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fdiv.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns_wide.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/fold-tied-op.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/fp-intrinsics-flags.ll | 50
-rw-r--r--  llvm/test/CodeGen/X86/fp128-g.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fp128-i128.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/frame-order.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fsafdo_test2.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/i386-shrink-wrapping.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/inline-asm-A-constraint.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/label-annotation.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/label-heapallocsite.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/late-remat-update.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/lifetime-alias.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/limit-split-cost.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/misched-copy.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/misched-matmul.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/movpc32-check.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/nocf_check.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr15705.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr18846.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr31045.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr32610.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34080-2.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34080.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34629.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr34634.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr42727.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pr48064.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath2.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/regparm.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/seh-catchpad.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/seh-except-finally.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/seh-no-invokes.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/shrinkwrap-hang.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/sqrt-fastmath.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/sse1.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll | 57
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll | 39
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll | 35
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll | 43
-rw-r--r--  llvm/test/CodeGen/X86/stack-protector-3.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/stack_guard_remat.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/tail-merge-wineh.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/tls-shrink-wrapping.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/unused_stackslots.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/uwtables.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/vec_int_to_fp.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/vector-sqrt.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/vector-width-store-merge.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/win-cleanuppad.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/win32-seh-catchpad.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/win32-seh-nested-finally.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll | 6
-rw-r--r--  llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll | 49
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/bar.ll (renamed from llvm/test/LTO/AArch64/TestInputs/bar.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/fiz.ll (renamed from llvm/test/LTO/AArch64/TestInputs/fiz.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/foo.ll (renamed from llvm/test/LTO/AArch64/TestInputs/foo.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/Inputs/old.ll (renamed from llvm/test/LTO/AArch64/TestInputs/old.ll) | 0
-rw-r--r--  llvm/test/LTO/AArch64/link-branch-target-enforcement.ll | 2
-rw-r--r--  llvm/test/LTO/AArch64/link-sign-return-address.ll | 8
-rw-r--r--  llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s | 2
-rw-r--r--  llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s | 191
-rw-r--r--  llvm/test/MC/AArch64/SME2p3/luti6.s | 472
-rw-r--r--  llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s | 5
-rw-r--r--  llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s | 2
-rw-r--r--  llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s | 4
-rw-r--r--  llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s | 15
-rw-r--r--  llvm/test/MC/AArch64/SVE2p2/fmmla.s | 45
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s | 147
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/arithmetic.s | 275
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s | 19
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/bfmmla.s | 45
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s | 193
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/cvt.s | 321
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s | 7
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s | 137
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/dot.s | 173
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s | 70
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/luti6.s | 115
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s | 343
-rw-r--r--  llvm/test/MC/AArch64/SVE2p3/qshrn.s | 255
-rw-r--r--  llvm/test/MC/AArch64/armv8.4a-mpam.s | 101
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s | 18
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-gcie.s | 985
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-memsys.s | 140
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s | 18
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mpamv2.s | 126
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s | 16
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-mtetc.s | 29
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s | 64
-rw-r--r--  llvm/test/MC/AArch64/armv9.7a-tlbid.s | 84
-rw-r--r--  llvm/test/MC/AArch64/neon-fdot-diagnostics.s | 59
-rw-r--r--  llvm/test/MC/AArch64/neon-fdot.s | 147
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s | 24
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-HtoS.s | 37
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla-diagnostics.s | 24
-rw-r--r--  llvm/test/MC/AArch64/neon-fmmla.s | 37
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s | 4
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_err.s | 10
-rw-r--r--  llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt | 66
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt | 3
-rw-r--r--  llvm/test/MC/Disassembler/Xtensa/debug.txt | 2
-rw-r--r--  llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s | 36
-rw-r--r--  llvm/test/MC/ELF/cfi-sframe-cfi-escape.s | 46
-rw-r--r--  llvm/test/MC/Hexagon/arch-support.s | 3
-rw-r--r--  llvm/test/MC/Hexagon/v81_arch.s | 10
-rw-r--r--  llvm/test/MC/PowerPC/ppc64-encoding-ext.s | 14
-rw-r--r--  llvm/test/MC/Xtensa/debug.s | 2
-rw-r--r--  llvm/test/MachineVerifier/test_g_shuffle_vector.mir | 12
-rw-r--r--  llvm/test/TableGen/intrinsic-manual-name.td | 6
-rw-r--r--  llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll | 10
-rw-r--r--  llvm/test/ThinLTO/X86/dtlto/json.ll | 20
-rw-r--r--  llvm/test/Transforms/ADCE/2016-09-06.ll | 4
-rw-r--r--  llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/basic.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/call-nested.ll | 9
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/call.ll | 7
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/diamond.ll | 10
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/first-only.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/invoke.ll | 30
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/multiple.ll | 4
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/no-discriminators.ll | 7
-rw-r--r--  llvm/test/Transforms/AddDiscriminators/oneline.ll | 7
-rw-r--r--  llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll | 30
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll | 4
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll | 4
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/dom-tree.ll | 4
-rw-r--r--  llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll | 7
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/add-nsw.ll | 6
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll | 6
-rw-r--r--  llvm/test/Transforms/Coroutines/coro-debug.ll | 37
-rw-r--r--  llvm/test/Transforms/Coroutines/coro-split-dbg.ll | 49
-rw-r--r--  llvm/test/Transforms/DeadArgElim/dbginfo.ll | 6
-rw-r--r--  llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll | 8
-rw-r--r--  llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll | 3
-rw-r--r--  llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll | 4
-rw-r--r--  llvm/test/Transforms/FunctionImport/funcimport_debug.ll | 7
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/exit-block.ll | 8
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/linezero.ll | 19
-rw-r--r--  llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll | 4
-rw-r--r--  llvm/test/Transforms/GVN/cond_br2.ll | 35
-rw-r--r--  llvm/test/Transforms/GVN/matrix-intrinsics.ll | 12
-rw-r--r--  llvm/test/Transforms/GVN/pr33549.ll | 6
-rw-r--r--  llvm/test/Transforms/GVN/pr42605.ll | 2
-rw-r--r--  llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll | 10
-rw-r--r--  llvm/test/Transforms/GVNHoist/pr30499.ll | 10
-rw-r--r--  llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll | 4
-rw-r--r--  llvm/test/Transforms/Inline/always-inline-attr.ll | 10
-rw-r--r--  llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll | 7
-rw-r--r--  llvm/test/Transforms/Inline/inline-vla.ll | 10
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll | 6
-rw-r--r--  llvm/test/Transforms/Inline/optimization-remarks.ll | 6
-rw-r--r--  llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll | 33
-rw-r--r--  llvm/test/Transforms/InstCombine/bitreverse-hang.ll | 4
-rw-r--r--  llvm/test/Transforms/InstCombine/constant-vector-insert.ll | 156
-rw-r--r--  llvm/test/Transforms/InstCombine/ctlz-cttz.ll | 145
-rw-r--r--  llvm/test/Transforms/InstCombine/intrinsic-select.ll | 36
-rw-r--r--  llvm/test/Transforms/InstCombine/phi.ll | 50
-rw-r--r--  llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 131
-rw-r--r--  llvm/test/Transforms/InstCombine/scmp.ll | 56
-rw-r--r--  llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll | 3
-rw-r--r--  llvm/test/Transforms/InstCombine/select-extractelement.ll | 18
-rw-r--r--  llvm/test/Transforms/InstCombine/select_frexp.ll | 8
-rw-r--r--  llvm/test/Transforms/InstCombine/sub-gep.ll | 33
-rw-r--r--  llvm/test/Transforms/JumpThreading/ddt-crash3.ll | 6
-rw-r--r--  llvm/test/Transforms/LICM/volatile-alias.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopRotate/noalias.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/pr18165.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll | 226
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll | 80
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll | 108
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll | 90
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll | 751
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll | 44
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll | 66
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll | 276
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll | 121
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll | 126
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll | 137
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/constantfolder.ll | 69
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata-width.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll | 30
-rw-r--r--  llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll | 8
-rw-r--r--  llvm/test/Transforms/MergeICmps/X86/pr41917.ll | 16
-rw-r--r--  llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll | 8
-rw-r--r--  llvm/test/Transforms/NewGVN/cond_br2-xfail.ll | 25
-rw-r--r--  llvm/test/Transforms/NewGVN/equivalent-phi.ll | 4
-rw-r--r--  llvm/test/Transforms/NewGVN/memory-handling.ll | 6
-rw-r--r--  llvm/test/Transforms/NewGVN/pr31483.ll | 11
-rw-r--r--  llvm/test/Transforms/NewGVN/pr31501.ll | 6
-rw-r--r--  llvm/test/Transforms/NewGVN/pr33187.ll | 9
-rw-r--r--  llvm/test/Transforms/NewGVN/pr33305.ll | 17
-rw-r--r--  llvm/test/Transforms/NewGVN/pr34430.ll | 4
-rw-r--r--  llvm/test/Transforms/NewGVN/pr34452.ll | 9
-rw-r--r--  llvm/test/Transforms/OpenMP/dead_use.ll | 18
-rw-r--r--  llvm/test/Transforms/OpenMP/icv_remarks.ll | 43
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll | 29
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll | 28
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll | 29
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll | 26
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-branch.ll | 28
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll | 47
-rw-r--r--  llvm/test/Transforms/PGOProfile/misexpect-switch.ll | 47
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll | 18
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll | 45
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll | 80
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll | 4
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll | 25
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll | 2
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll | 4
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll | 34
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll | 14
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll | 2
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll | 18
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll | 5
-rw-r--r--  llvm/test/Transforms/SCCP/conditions-ranges.ll | 25
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll | 15
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll | 7
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll | 5
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/consecutive-access.ll | 18
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll | 28
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll | 30
-rw-r--r--  llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll | 8
-rw-r--r--  llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll | 3
-rw-r--r--  llvm/test/Transforms/SafeStack/ARM/debug.ll | 7
-rw-r--r--  llvm/test/Transforms/SafeStack/X86/debug-loc.ll | 5
-rw-r--r--  llvm/test/Transforms/SafeStack/X86/debug-loc2.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/branch.ll | 18
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-import-list.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-inline.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-summary.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/entry_counts_cold.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll | 5
-rw-r--r--  llvm/test/Transforms/SampleProfile/fsafdo_test.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/gcc-simple.ll | 13
-rw-r--r--  llvm/test/Transforms/SampleProfile/inline-act.ll | 6
-rw-r--r--  llvm/test/Transforms/SampleProfile/misexpect.ll | 22
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll | 11
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll | 7
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/norepeated-icp.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/offset.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-order.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-context-tracker.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/profile-topdown-order.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/propagate.ll | 11
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll | 9
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/remarks.ll | 4
-rw-r--r--  llvm/test/Transforms/SampleProfile/uniqname.ll | 13
-rw-r--r--  llvm/test/Transforms/Scalarizer/dbginfo.ll | 7
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll | 4
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll | 20
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll | 26
-rw-r--r--  llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll | 4
-rw-r--r--  llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll | 4
-rw-r--r--  llvm/test/Transforms/Util/PredicateInfo/testandor.ll | 27
-rw-r--r--  llvm/test/Transforms/Util/dbg-user-of-aext.ll | 7
-rw-r--r--  llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll | 14
-rw-r--r--  llvm/test/Verifier/atomics.ll | 15
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir | 92
-rw-r--r--  llvm/test/tools/llvm-ir2vec/entities.mir | 28
-rw-r--r--  llvm/test/tools/llvm-ir2vec/error-handling.ll | 2
-rw-r--r--  llvm/test/tools/llvm-ir2vec/error-handling.mir | 41
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/lit.local.cfg | 3
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt | 33
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt | 7174
-rw-r--r--  llvm/test/tools/llvm-ir2vec/triplets.mir | 61
-rw-r--r--  llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s | 34
-rw-r--r--  llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test | 31
-rw-r--r--  llvm/test/tools/llvm-profdata/input-wildcard.test | 15
-rw-r--r--  llvm/test/tools/llvm-readobj/ELF/section-types.test | 5
-rw-r--r--  llvm/test/tools/opt/no-target-machine.ll | 18
729 files changed, 36762 insertions, 12437 deletions
diff --git a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
index ed851f2..67ce44e 100644
--- a/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
+++ b/llvm/test/Analysis/BranchProbabilityInfo/pr22718.ll
@@ -12,14 +12,14 @@
@.str = private unnamed_addr constant [17 x i8] c"x = %lu\0Ay = %lu\0A\00", align 1
; Function Attrs: inlinehint nounwind uwtable
-define i32 @main() #0 {
+define i32 @main() {
entry:
%retval = alloca i32, align 4
%i = alloca i64, align 8
store i32 0, ptr %retval
store i64 0, ptr @y, align 8
store i64 0, ptr @x, align 8
- call void @srand(i32 422304) #3
+ call void @srand(i32 422304)
store i64 0, ptr %i, align 8
br label %for.cond
@@ -29,7 +29,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !prof !1
for.body: ; preds = %for.cond
- %call = call i32 @rand() #3
+ %call = call i32 @rand()
%conv = sitofp i32 %call to double
%mul = fmul double %conv, 1.000000e+02
%div = fdiv double %mul, 0x41E0000000000000
@@ -65,17 +65,12 @@ for.end: ; preds = %for.cond
}
; Function Attrs: nounwind
-declare void @srand(i32) #1
+declare void @srand(i32)
; Function Attrs: nounwind
-declare i32 @rand() #1
+declare i32 @rand()
-declare i32 @printf(ptr, ...) #2
-
-attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare i32 @printf(ptr, ...)
!llvm.ident = !{!0}
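
The pr22718.ll hunks above strip attribute groups the test never relied on. For background (and not part of the patch itself): in LLVM IR, a `#N` token on a function or call site is a reference to a module-level attribute group, so once the references are deleted, the group definitions are dead text. A minimal sketch with a hypothetical function @f:

; Hypothetical module illustrating attribute-group syntax; not from this patch.
; The #0 on @f attaches the group defined at module scope below.
define i32 @f() #0 {
entry:
  ret i32 0
}

; nounwind marks @f as never unwinding; a group with no remaining #N
; references contributes nothing, which is why the cleanup above can
; drop both the references and the attributes lines together.
attributes #0 = { nounwind }
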
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index e007800..ac654dd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-1
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve2p1 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme2 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -intrinsic-cost-strategy=type-based-intrinsic-cost -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=TYPE_BASED_ONLY
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
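
The new RUN lines above run the same input under +sve, +sve2p1, and +sme2 and share check lines via --check-prefixes. A minimal sketch of that FileCheck pattern, with hypothetical prefixes and cost values:

; Hypothetical test fragment; prefixes and costs are illustrative only.
; RUN: opt < %s -passes="print<cost-model>" -disable-output -mattr=+sve 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
; RUN: opt < %s -passes="print<cost-model>" -disable-output -mattr=+sve2p1 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2P1
;
; The label line is tagged CHECK, so both invocations match it; each cost
; line is tagged per feature, so each invocation checks only its own.
; CHECK-LABEL: 'f'
; CHECK-SVE: Cost Model: Found costs of 5
; CHECK-SVE2P1: Cost Model: Found costs of 1

Each FileCheck invocation activates only the lines tagged with one of its prefixes, so configurations that agree share a single CHECK line while diverging costs get a feature-specific prefix, as the diff above does with CHECK-SVE and CHECK-SVE2p1-OR-SME2.
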
@@ -920,7 +922,8 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
@@ -934,28 +937,53 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-VSCALE-2-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-SVE-LABEL: 'get_lane_mask'
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-SVE2p1-OR-SME2-LABEL: 'get_lane_mask'
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 5 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
@@ -966,7 +994,8 @@ define void @get_lane_mask() #0 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
@@ -990,6 +1019,7 @@ define void @get_lane_mask() #0 {
%mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
%mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+ %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
%mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
%mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
@@ -1416,6 +1446,7 @@ declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32)
declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32)
+declare <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64, i64)
declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
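For reference when reading the CHECK-SVE and CHECK-SVE2p1-OR-SME2 blocks above: with -cost-kind=all the cost-model printer emits all four cost kinds per instruction (RThru = reciprocal throughput, CodeSize, Lat = latency, SizeLat = size-and-latency) and collapses them to a single number when all four agree. A reproduction sketch, assuming a locally built opt; the SVE target flags below are an assumption, the authoritative ones are in the file's own RUN lines outside this excerpt:

  opt -passes="print<cost-model>" -cost-kind=all -disable-output \
      -mtriple=aarch64 -mattr=+sve <path-to-this-test>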
diff --git a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
index 1e9a5f7..c0410cf 100644
--- a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to llvm/test/CodeGen/ARM
@@ -15,13 +17,18 @@ target triple = "armv7--linux-gnueabihf"
%T464 = type <4 x i64>
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'direct'
+; COST-LABEL: 'direct'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
%v1 = load %T432, ptr %loadaddr2
; ASM: vld1.64
- %r3 = add %T432 %v0, %v1
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %v0, %v1
; ASM: vadd.i32
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -29,16 +36,22 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups1632'
+; COST-LABEL: 'ups1632'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
%v1 = load %T416, ptr %loadaddr2
; ASM: vldr
%r1 = sext %T416 %v0 to %T432
%r2 = sext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
- %r3 = add %T432 %r1, %r2
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %r1, %r2
; ASM: vaddl.s16
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -46,16 +59,22 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu1632'
+; COST-LABEL: 'upu1632'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
%v1 = load %T416, ptr %loadaddr2
; ASM: vldr
%r1 = zext %T416 %v0 to %T432
%r2 = zext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
- %r3 = add %T432 %r1, %r2
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+ %r3 = add %T432 %r1, %r2
; ASM: vaddl.u16
store %T432 %r3, ptr %storeaddr
; ASM: vst1.64
@@ -63,51 +82,66 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
}
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups3264'
+; COST-LABEL: 'ups3264'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
%v1 = load %T232, ptr %loadaddr2
; ASM: vldr
- %r3 = add %T232 %v0, %v1
+ %r3 = add %T232 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
%st = sext %T232 %r3 to %T264
; ASM: vmovl.s32
-; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
store %T264 %st, ptr %storeaddr
; ASM: vst1.64
ret void
}
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu3264'
+; COST-LABEL: 'upu3264'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
%v1 = load %T232, ptr %loadaddr2
; ASM: vldr
- %r3 = add %T232 %v0, %v1
+ %r3 = add %T232 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
%st = zext %T232 %r3 to %T264
; ASM: vmovl.u32
-; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
store %T264 %st, ptr %storeaddr
; ASM: vst1.64
ret void
}
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'dn3216'
+; COST-LABEL: 'dn3216'
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
%v1 = load %T432, ptr %loadaddr2
; ASM: vld1.64
- %r3 = add %T432 %v0, %v1
+ %r3 = add %T432 %v0, %v1
; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
%st = trunc %T432 %r3 to %T416
; ASM: vmovn.i32
-; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
store %T416 %st, ptr %storeaddr
; ASM: vstr
ret void
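For reference: the COST check lines in this file are autogenerated by the script named in the NOTE line (utils/update_analyze_test_checks.py, UTC_ARGS: --version 6). A minimal regeneration sketch, assuming a locally built opt at build/bin/opt (that path is an assumption):

  python3 llvm/utils/update_analyze_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll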
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index c2248c2..e5bbac6 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/Analysis/CostModel/ARM/fparith.ll b/llvm/test/Analysis/CostModel/ARM/fparith.ll
index f9424e8..6f2626c 100644
--- a/llvm/test/Analysis/CostModel/ARM/fparith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/fparith.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @f32() {
; CHECK-MVE-LABEL: 'f32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd float undef, undef
%d = fsub float undef, undef
@@ -25,16 +25,16 @@ define void @f32() {
define void @f16() {
; CHECK-MVE-LABEL: 'f16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd half undef, undef
%d = fsub half undef, undef
@@ -44,16 +44,16 @@ define void @f16() {
define void @f64() {
; CHECK-MVE-LABEL: 'f64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'f64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c = fadd double undef, undef
%d = fsub double undef, undef
@@ -63,28 +63,28 @@ define void @f64() {
define void @vf32() {
; CHECK-MVE-LABEL: 'vf32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = fadd <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = fsub <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = fmul <8 x float> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %c8 = fadd <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %d8 = fsub <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %e8 = fmul <8 x float> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x float> undef, undef
%d2 = fsub <2 x float> undef, undef
@@ -100,28 +100,28 @@ define void @vf32() {
define void @vf16() {
; CHECK-MVE-LABEL: 'vf16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = fadd <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = fsub <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = fmul <8 x half> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c8 = fadd <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d8 = fsub <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = fmul <8 x half> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x half> undef, undef
%d2 = fsub <2 x half> undef, undef
@@ -137,28 +137,28 @@ define void @vf16() {
define void @vf64() {
; CHECK-MVE-LABEL: 'vf64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'vf64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%c2 = fadd <2 x double> undef, undef
%d2 = fsub <2 x double> undef, undef
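For reference: the CHECK-MVEFP lines above now carry distinct per-kind costs, e.g. RThru:4 CodeSize:2 Lat:4 SizeLat:4 for the <8 x float> operations, where CodeSize:2 is consistent with a 256-bit vector being legalized into two 128-bit MVE operations. A reproduction sketch using the file's own RUN-line flags:

  opt -passes="print<cost-model>" -cost-kind=all -disable-output \
      -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp llvm/test/Analysis/CostModel/ARM/fparith.ll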
diff --git a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
index aff7b19..af548f6 100644
--- a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll
@@ -1,213 +1,213 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP
define void @casts() {
; CHECK-MVE-LABEL: 'casts'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 314 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 626 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 496 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1360 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 922 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1352 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1296 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1834 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1672 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1568 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4912 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4800 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 3018 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2760 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2856 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2752 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4880 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4768 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:314 CodeSize:85 Lat:169 SizeLat:169 for: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:404 CodeSize:59 Lat:133 SizeLat:133 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:376 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:626 CodeSize:169 Lat:337 SizeLat:337 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:115 Lat:261 SizeLat:261 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:496 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1360 CodeSize:117 Lat:265 SizeLat:265 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1304 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:922 CodeSize:169 Lat:337 SizeLat:337 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:115 Lat:261 SizeLat:261 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:792 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1352 CodeSize:117 Lat:265 SizeLat:265 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1296 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1834 CodeSize:337 Lat:673 SizeLat:673 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:226 Lat:515 SizeLat:515 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:227 Lat:517 SizeLat:517 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1672 CodeSize:229 Lat:521 SizeLat:521 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1568 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4912 CodeSize:233 Lat:529 SizeLat:529 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4800 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:3018 CodeSize:337 Lat:673 SizeLat:673 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:226 Lat:515 SizeLat:515 for: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:227 Lat:517 SizeLat:517 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2760 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2856 CodeSize:229 Lat:521 SizeLat:521 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2752 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4880 CodeSize:233 Lat:529 SizeLat:529 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4768 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'casts'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 234 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1148 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1104 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 762 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 648 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1192 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1152 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4488 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2698 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2472 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2536 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2464 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4560 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4480 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:80 CodeSize:35 Lat:45 SizeLat:45 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:94 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:84 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:234 CodeSize:69 Lat:89 SizeLat:89 for: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:324 CodeSize:43 Lat:53 SizeLat:53 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:304 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1148 CodeSize:43 Lat:53 SizeLat:53 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1104 CodeSize:5 Lat:9 SizeLat:9 for: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:762 CodeSize:137 Lat:177 SizeLat:177 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:83 Lat:101 SizeLat:101 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:648 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1192 CodeSize:85 Lat:105 SizeLat:105 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1152 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4488 CodeSize:85 Lat:105 SizeLat:105 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:9 Lat:17 SizeLat:17 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2698 CodeSize:273 Lat:353 SizeLat:353 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:162 Lat:195 SizeLat:195 for: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:163 Lat:197 SizeLat:197 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2472 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2536 CodeSize:165 Lat:201 SizeLat:201 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2464 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4560 CodeSize:169 Lat:209 SizeLat:209 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4480 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef)
%f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef)
@@ -324,110 +324,110 @@ define void @casts() {
define void @fp16() {
; CHECK-MVE-LABEL: 'fp16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 550 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1362 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1306 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1250 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 992 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1680 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4920 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4808 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:480 CodeSize:169 Lat:337 SizeLat:337 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:550 CodeSize:115 Lat:261 SizeLat:261 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1362 CodeSize:117 Lat:265 SizeLat:265 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1306 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1250 CodeSize:337 Lat:673 SizeLat:673 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:226 Lat:515 SizeLat:515 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:227 Lat:517 SizeLat:517 for: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:992 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1680 CodeSize:229 Lat:521 SizeLat:521 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4920 CodeSize:233 Lat:529 SizeLat:529 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4808 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'fp16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1112 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1102 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4484 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:44 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:9 SizeLat:9 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:6 Lat:11 SizeLat:11 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1112 CodeSize:8 Lat:15 SizeLat:15 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1102 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:124 CodeSize:75 Lat:85 SizeLat:85 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:48 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4484 CodeSize:79 Lat:93 SizeLat:93 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
%f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/freeshift.ll b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
index 51e87b5..cd5c8c5 100644
--- a/llvm/test/Analysis/CostModel/ARM/freeshift.ll
+++ b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @shl(i32 %a, i32 %b) {
; CHECK-LABEL: 'shl'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = shl i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = shl i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = shl i32 %a, 3
%ac = add i32 %b, %as
@@ -36,19 +36,19 @@ define void @shl(i32 %a, i32 %b) {
define void @ashr(i32 %a, i32 %b) {
; CHECK-LABEL: 'ashr'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = ashr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = ashr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = ashr i32 %a, 3
%ac = add i32 %b, %as
@@ -67,19 +67,19 @@ define void @ashr(i32 %a, i32 %b) {
define void @lshr(i32 %a, i32 %b) {
; CHECK-LABEL: 'lshr'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ss = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %xs = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ns = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %os = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %is = lshr i32 %a, 3
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ss = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %xs = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %ns = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %os = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %is = lshr i32 %a, 3
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret void
;
%as = lshr i32 %a, 3
%ac = add i32 %b, %as
diff --git a/llvm/test/Analysis/CostModel/ARM/gep.ll b/llvm/test/Analysis/CostModel/ARM/gep.ll
index 48de193..cce87a5 100644
--- a/llvm/test/Analysis/CostModel/ARM/gep.ll
+++ b/llvm/test/Analysis/CostModel/ARM/gep.ll
@@ -1,98 +1,98 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @testi8(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi8'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi8'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi8'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi8'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi8'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi8'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi8'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i8, ptr %a, i32 1
%am4 = getelementptr inbounds i8, ptr %a, i32 -1
@@ -109,88 +109,88 @@ define void @testi8(ptr %a, i32 %i) {
define void @testi16(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi16'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi16'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi16'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi16'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi16'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi16'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i16, ptr %a, i32 1
%am4 = getelementptr inbounds i16, ptr %a, i32 -1
@@ -207,88 +207,88 @@ define void @testi16(ptr %a, i32 %i) {
define void @testi32(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi32'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi32'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi32'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi32'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi32'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi32'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i32, ptr %a, i32 1
%am4 = getelementptr inbounds i32, ptr %a, i32 -1
@@ -305,102 +305,102 @@ define void @testi32(ptr %a, i32 %i) {
define void @testi64(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testi64'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testi64'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testi64'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testi64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testi64'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testi64'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testi64'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds i64, ptr %a, i32 1
%am4 = getelementptr inbounds i64, ptr %a, i32 -1
@@ -419,102 +419,102 @@ define void @testi64(ptr %a, i32 %i) {
define void @testhalf(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testhalf'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testhalf'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testhalf'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testhalf'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testhalf'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testhalf'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testhalf'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds half, ptr %a, i32 1
%am1 = getelementptr inbounds half, ptr %a, i32 -1
@@ -533,102 +533,102 @@ define void @testhalf(ptr %a, i32 %i) {
define void @testfloat(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testfloat'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testfloat'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testfloat'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testfloat'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testfloat'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testfloat'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testfloat'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds float, ptr %a, i32 1
%am1 = getelementptr inbounds float, ptr %a, i32 -1
@@ -647,102 +647,102 @@ define void @testfloat(ptr %a, i32 %i) {
define void @testdouble(ptr %a, i32 %i) {
; CHECK-V6M-LABEL: 'testdouble'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testdouble'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testdouble'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testdouble'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testdouble'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testdouble'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testdouble'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%a1 = getelementptr inbounds double, ptr %a, i32 1
%am1 = getelementptr inbounds double, ptr %a, i32 -1
@@ -761,375 +761,375 @@ define void @testdouble(ptr %a, i32 %i) {
define void @testvecs(i32 %i) {
; CHECK-V6M-LABEL: 'testvecs'
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V6M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V6M-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V6M-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-NOFP-LABEL: 'testvecs'
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-NOFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-NOFP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-V7M-FP-LABEL: 'testvecs'
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'testvecs'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-MVEFP-LABEL: 'testvecs'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-T32-LABEL: 'testvecs'
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-T32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-T32-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-T32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-A32-LABEL: 'testvecs'
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-A32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-A32-NEXT: Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-A32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
diff --git a/llvm/test/Analysis/CostModel/ARM/immediates.ll b/llvm/test/Analysis/CostModel/ARM/immediates.ll
index ed13636..cd42313 100644
--- a/llvm/test/Analysis/CostModel/ARM/immediates.ll
+++ b/llvm/test/Analysis/CostModel/ARM/immediates.ll
@@ -1,145 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1-THROUGHPUT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2-THROUGHPUT
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-T1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-T2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define i32 @const_costs() {
-; CHECK-T1-SIZE-LABEL: 'const_costs'
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T1-LABEL: 'const_costs'
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T1-NEXT: Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T1-NEXT: Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T1-NEXT: Cost Model: Found costs of 1 for: ret i32 1
;
-; CHECK-T2-SIZE-LABEL: 'const_costs'
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-LATENCY-LABEL: 'const_costs'
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-LATENCY-LABEL: 'const_costs'
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-LATENCY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T2-LABEL: 'const_costs'
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T2-NEXT: Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T2-NEXT: Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T2-NEXT: Cost Model: Found costs of 1 for: ret i32 1
;
%add_1 = add i32 undef, 1
%add_32767 = add i32 undef, 32767
diff --git a/llvm/test/Analysis/CostModel/ARM/insertelement.ll b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
index 5a922dd..f14b200 100644
--- a/llvm/test/Analysis/CostModel/ARM/insertelement.ll
+++ b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios6.0.0"
@@ -7,12 +8,16 @@ target triple = "thumbv7-apple-ios6.0.0"
; due to renaming constraints.
%T_i8v = type <8 x i8>
%T_i8 = type i8
-; CHECK: insertelement_i8
-define void @insertelement_i8(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i8(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i8'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <8 x i8>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %saddr, align 1
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <8 x i8> %v0, i8 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <8 x i8> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i8v, ptr %vaddr
%v1 = load %T_i8, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
%v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
store %T_i8v %v2, ptr %vaddr
ret void
@@ -21,12 +26,16 @@ define void @insertelement_i8(ptr %saddr,
%T_i16v = type <4 x i16>
%T_i16 = type i16
-; CHECK: insertelement_i16
-define void @insertelement_i16(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i16(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i16'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %saddr, align 2
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i16> %v0, i16 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i16v, ptr %vaddr
%v1 = load %T_i16, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
%v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
store %T_i16v %v2, ptr %vaddr
ret void
@@ -34,12 +43,16 @@ define void @insertelement_i16(ptr %saddr,
%T_i32v = type <2 x i32>
%T_i32 = type i32
-; CHECK: insertelement_i32
-define void @insertelement_i32(ptr %saddr,
- ptr %vaddr) {
+define void @insertelement_i32(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i32'
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %saddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2 = insertelement <2 x i32> %v0, i32 %v1, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <2 x i32> %v2, ptr %vaddr, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
%v0 = load %T_i32v, ptr %vaddr
%v1 = load %T_i32, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
%v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
store %T_i32v %v2, ptr %vaddr
ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
deleted file mode 100644
index 6377437..0000000
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput < %s | FileCheck %s --check-prefix=THRU
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency < %s | FileCheck %s --check-prefix=LATE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size < %s | FileCheck %s --check-prefix=SIZE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency < %s | FileCheck %s --check-prefix=SIZE_LATE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-; Test a cross-section of intrinsics for various cost-kinds.
-; Other test files may check for accuracy of a particular intrinsic
-; across subtargets or types. This is just a basic correctness check using an
-; ARM target and a legal scalar type (i32/float) and/or an
-; illegal vector type (16 x i32/float).
-
-declare i32 @llvm.smax.i32(i32, i32)
-declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
-
-declare float @llvm.fmuladd.f32(float, float, float)
-declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
-
-declare float @llvm.log2.f32(float)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-
-declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
-declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
-
-declare float @llvm.maximum.f32(float, float)
-declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
-
-declare i32 @llvm.cttz.i32(i32, i1)
-declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
-
-declare i32 @llvm.ctlz.i32(i32, i1)
-declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
-
-declare i32 @llvm.fshl.i32(i32, i32, i32)
-declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
-
-declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
-declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
-declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
-
-declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
-
-declare i32 @llvm.ssa.copy.i32(i32)
-declare float @llvm.ssa.copy.f32(float)
-declare ptr @llvm.ssa.copy.p0(ptr)
-
-define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
-; THRU-LABEL: 'smax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'smax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'smax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'smax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
- %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
- ret void
-}
-
-define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) {
-; THRU-LABEL: 'fmuladd'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fmuladd'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fmuladd'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fmuladd'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
- %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
- ret void
-}
-
-define void @log2(float %a, <16 x float> %va) {
-; THRU-LABEL: 'log2'
-; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; THRU-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'log2'
-; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'log2'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'log2'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.log2.f32(float %a)
- %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
- ret void
-}
-
-define void @constrained_fadd(float %a, <16 x float> %va) strictfp {
-; THRU-LABEL: 'constrained_fadd'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; THRU-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'constrained_fadd'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'constrained_fadd'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'constrained_fadd'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
- %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
- ret void
-}
-
-define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
-; THRU-LABEL: 'fmaximum'
-; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fmaximum'
-; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fmaximum'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fmaximum'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call float @llvm.maximum.f32(float %a, float %b)
- %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
- ret void
-}
-
-define void @cttz(i32 %a, <16 x i32> %va) {
-; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'cttz'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
- %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
- ret void
-}
-
-define void @ctlz(i32 %a, <16 x i32> %va) {
-; THRU-LABEL: 'ctlz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'ctlz'
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'ctlz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'ctlz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
- %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
- ret void
-}
-
-define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) {
-; THRU-LABEL: 'fshl'
-; THRU-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; THRU-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'fshl'
-; LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'fshl'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'fshl'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
- %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
- ret void
-}
-
-define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
-; THRU-LABEL: 'maskedgather'
-; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'maskedgather'
-; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'maskedgather'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'maskedgather'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
- ret void
-}
-
-define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
-; THRU-LABEL: 'maskedscatter'
-; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'maskedscatter'
-; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'maskedscatter'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'maskedscatter'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
- ret void
-}
-
-define void @reduce_fmax(<16 x float> %va) {
-; THRU-LABEL: 'reduce_fmax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
- ret void
-}
-
-define void @memcpy(ptr %a, ptr %b, i32 %c) {
-; THRU-LABEL: 'memcpy'
-; THRU-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'memcpy'
-; LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'memcpy'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'memcpy'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
- ret void
-}
-
-define void @ssa_copy() {
-; THRU-LABEL: 'ssa_copy'
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'ssa_copy'
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'ssa_copy'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'ssa_copy'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %i = call i32 @llvm.ssa.copy.i32(i32 undef)
- %f = call float @llvm.ssa.copy.f32(float undef)
- %p = call ptr @llvm.ssa.copy.p0(ptr undef)
- ret void
-}
diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
index 4404209..c98601f 100644
--- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s
+
; Check that the memory cost model for a load of an unusually sized integer
; followed by a trunc to a register sized integer gives a cost of 1 rather
; than the expanded cost. Currently, this target does not have that
; expansion.
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK
-
; Check that the cost is 1 for an unusually sized integer load truncated to a register sized integer.
define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc = trunc i128 %out to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i32 %trunc
;
%out = load i128, ptr %ptr
 %trunc = trunc i128 %out to i32
 ret i32 %trunc
@@ -20,8 +20,8 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
define i128 @loadUnusualInteger(ptr %ptr) {
; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i128 %out
;
%out = load i128, ptr %ptr
 ret i128 %out
}
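
[Aside, not part of the patch: a minimal sketch of a test in the same style,
showing how the -cost-kind=all output format used throughout this diff is
asserted. The function name is hypothetical; the expected load cost line is
copied from the checks above, and in practice such assertions are regenerated
with utils/update_analyze_test_checks.py rather than written by hand.]

; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s
define i32 @loadWideWithTrunc(ptr %ptr) {
; CHECK-LABEL: 'loadWideWithTrunc'
; CHECK: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
  ; Wide i128 load on a 32-bit target: throughput and latency cost 4,
  ; size-based cost kinds 1, matching the checks in this diff.
  %out = load i128, ptr %ptr, align 8
  ; The trunc to a register-sized i32 is reported as cost 0 above.
  %trunc = trunc i128 %out to i32
  ret i32 %trunc
}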
diff --git a/llvm/test/Analysis/CostModel/ARM/load_store.ll b/llvm/test/Analysis/CostModel/ARM/load_store.ll
index 4c322e9..dd2eaae 100644
--- a/llvm/test/Analysis/CostModel/ARM/load_store.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load_store.ll
@@ -1,171 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=CHECK-V8-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @stores() {
; CHECK-NOVEC-LABEL: 'stores'
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store double undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-FP-LABEL: 'stores'
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'stores'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'stores'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 7 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'stores'
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'stores'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
store i8 undef, ptr undef, align 4
store i16 undef, ptr undef, align 4
@@ -199,160 +145,108 @@ define void @stores() {
define void @loads() {
; CHECK-NOVEC-LABEL: 'loads'
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-FP-LABEL: 'loads'
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-FP-NEXT: Cost Model: Found costs of 1 for: ret void
;
; CHECK-MVE-LABEL: 'loads'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'loads'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'loads'
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'loads'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
load i8, ptr undef, align 4
load i16, ptr undef, align 4
diff --git a/llvm/test/Analysis/CostModel/ARM/logicalop.ll b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
index 967426c..82eb716 100644
--- a/llvm/test/Analysis/CostModel/ARM/logicalop.ll
+++ b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
@@ -1,72 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @op() {
; Logical and/or - a select's cost must be equivalent to that of the corresponding binop
-; CHECK-MVE-RECIP-LABEL: 'op'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'op'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'op'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'op'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'op'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'op'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'op'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'op'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'op'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'op'
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'op'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'op'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%sand = select i1 undef, i1 undef, i1 false
%band = and i1 undef, undef
@@ -77,61 +45,33 @@ define void @op() {
}
define void @vecop() {
-; CHECK-MVE-RECIP-LABEL: 'vecop'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'vecop'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'vecop'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'vecop'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vecop'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'vecop'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'vecop'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'vecop'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'vecop'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %band = and <4 x i1> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %bor = or <4 x i1> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'vecop'
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %band = and <4 x i1> undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %bor = or <4 x i1> undef, undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'vecop'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'vecop'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> <i1 false, i1 false, i1 false, i1 false>
%band = and <4 x i1> undef, undef
diff --git a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
index 17d4263..07d2bd0 100644
--- a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are for reference only; tests in that direction should go to tests/CodeGen/ARM
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
index 7de2799..3ae02cd 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
define i64 @test(i16 %a, i16 %b) {
; CHECK-LABEL: 'test'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = sext i16 %a to i32
%bs = sext i16 %b to i32
@@ -26,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
; CHECK-LABEL: 'withadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%as = sext i16 %a to i32
%bs = sext i16 %b to i32
@@ -51,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
; CHECK-LABEL: 'withloads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%a = load i16, ptr %pa
%b = load i16, ptr %pb
@@ -82,18 +82,18 @@ define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
define i64 @different_extend_ops(i16 %a, i16 %b) {
; CHECK-LABEL: 'different_extend_ops'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'different_extend_ops'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = sext i16 %a to i32
%bs = zext i16 %b to i32
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
index 521816d13..04a9520 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
@@ -1,20 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+
define i64 @test(i16 %a, i16 %b) {
; CHECK-LABEL: 'test'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %ms
;
%as = zext i16 %a to i32
%bs = zext i16 %b to i32
@@ -25,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
define i64 @withadd(i16 %a, i16 %b, i64 %c) {
; CHECK-LABEL: 'withadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%as = zext i16 %a to i32
%bs = zext i16 %b to i32
@@ -50,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
; CHECK-LABEL: 'withloads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found costs of 1 for: ret i64 %r
;
%a = load i16, ptr %pa
%b = load i16, ptr %pb
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
new file mode 100644
index 0000000..b3ad818
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Test a cross-section of intrinsics for various cost kinds.
+; Other test files may check the accuracy of a particular intrinsic
+; across subtargets or types. This is just a basic correctness check using an
+; ARM target and a legal scalar type (i32/float) and/or an
+; illegal vector type (16 x i32/float).
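+;
+; With -cost-kind=all, each check line reports reciprocal throughput (RThru),
+; code size (CodeSize), latency (Lat) and size-and-latency (SizeLat) costs
+; together, collapsing to a single "Found costs of N" when all four agree.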
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
+
+declare float @llvm.log2.f32(float)
+declare <16 x float> @llvm.log2.v16f32(<16 x float>)
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
+
+declare float @llvm.maximum.f32(float, float)
+declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
+
+declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
+
+declare i32 @llvm.ssa.copy.i32(i32)
+declare float @llvm.ssa.copy.f32(float)
+declare ptr @llvm.ssa.copy.p0(ptr)
+
+define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
+; CHECK-LABEL: 'smax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+ %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+ ret void
+}
+
+define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'fmuladd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+ ret void
+}
+
+define void @log2(float %a, <16 x float> %va) {
+; CHECK-LABEL: 'log2'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.log2.f32(float %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:192 CodeSize:48 Lat:192 SizeLat:192 for: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.log2.f32(float %a)
+ %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+ ret void
+}
+
+define void @constrained_fadd(float %a, <16 x float> %va) strictfp {
+; CHECK-LABEL: 'constrained_fadd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of 48 for: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret void
+}
+
+define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
+; CHECK-LABEL: 'fmaximum'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.maximum.f32(float %a, float %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:208 CodeSize:64 Lat:208 SizeLat:208 for: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call float @llvm.maximum.f32(float %a, float %b)
+ %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+ ret void
+}
+
+define void @cttz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'cttz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+ ret void
+}
+
+define void @ctlz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'ctlz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+ ret void
+}
+
+define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) {
+; CHECK-LABEL: 'fshl'
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:92 Lat:120 SizeLat:120 for: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+ %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+ ret void
+}
+
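+; Note that the masked gather/scatter calls are printed in auto-upgraded
+; form: the i32 alignment operand in the input IR appears as an "align"
+; parameter attribute on the pointer vector in the checked output.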
+define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'maskedgather'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+ ret void
+}
+
+define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
+; CHECK-LABEL: 'maskedscatter'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
+ ret void
+}
+
+define void @reduce_fmax(<16 x float> %va) {
+; CHECK-LABEL: 'reduce_fmax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+ ret void
+}
+
+define void @memcpy(ptr %a, ptr %b, i32 %c) {
+; CHECK-LABEL: 'memcpy'
+; CHECK-NEXT: Cost Model: Found costs of 4 for: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+ ret void
+}
+
+define void @ssa_copy() {
+; CHECK-LABEL: 'ssa_copy'
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %f = call float @llvm.ssa.copy.f32(float undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+ %f = call float @llvm.ssa.copy.f32(float undef)
+ %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
new file mode 100644
index 0000000..d09216a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
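+; A sampling of ARM/MVE target intrinsics, all of which are expected to get
+; the default cost of 1 for every cost kind.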
+define void @intrinsics() {
+; CHECK-LABEL: 'intrinsics'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+ %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+ %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+ %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+ %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+ ret void
+}
+
+declare i32 @llvm.arm.ssat(i32, i32)
+declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
+declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll
index f626901..de429fd2 100644
--- a/llvm/test/Analysis/CostModel/ARM/select.ll
+++ b/llvm/test/Analysis/CostModel/ARM/select.ll
@@ -1,272 +1,140 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @selects() {
; Scalar values
-; CHECK-MVE-RECIP-LABEL: 'selects'
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-LABEL: 'selects'
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:10 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-NEON-RECIP-LABEL: 'selects'
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-LABEL: 'selects'
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 19 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 50 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 100 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-THUMB1-RECIP-LABEL: 'selects'
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB1-LABEL: 'selects'
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB1-NEXT: Cost Model: Found costs of 1 for: ret void
;
-; CHECK-THUMB2-RECIP-LABEL: 'selects'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'selects'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'selects'
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'selects'
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'selects'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB2-LABEL: 'selects'
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB2-NEXT: Cost Model: Found costs of 1 for: ret void
;
%v0 = select i1 undef, i1 undef, i1 undef
%v1 = select i1 undef, i8 undef, i8 undef
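
The check lines above illustrate the four-way cost output that `-cost-kind=all` turns on: each instruction reports RThru (reciprocal throughput), CodeSize, Lat (latency) and SizeLat (size-and-latency), and the printer collapses the tuple to a single `Found costs of N for:` when all four values agree. A minimal standalone sketch of a test written against this format, assuming the same MVE triple used above; the function name and single CHECK line are illustrative only, not part of this patch:

; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output \
; RUN:   -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s

; Hypothetical test checking a single select instruction.
; CHECK-LABEL: 'simple_select'
; CHECK: Cost Model: Found costs of {{.*}} for: %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
define <4 x i32> @simple_select(<4 x i1> %c, <4 x i32> %a, <4 x i32> %b) {
  %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %s
}
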
diff --git a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
index 465611d..5ef869e 100644
--- a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to tests/CodeGen/ARM
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
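
As the NOTE at the top of each file says, these assertions are autogenerated, so after a cost-model change they are refreshed with utils/update_analyze_test_checks.py rather than edited by hand. A sketch of the invocation, assuming a build tree at build/ and the script's --opt-binary option:

# Re-runs each RUN line through opt and rewrites the autogenerated
# CHECK lines of the named test in place.
python3 llvm/utils/update_analyze_test_checks.py \
  --opt-binary=build/bin/opt \
  llvm/test/Analysis/CostModel/ARM/shuffle.ll
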
diff --git a/llvm/test/Analysis/CostModel/ARM/shuffle.ll b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
index a17bbab..0fd1456 100644
--- a/llvm/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
@@ -1,59 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @broadcast() {
; CHECK-MVE-LABEL: 'broadcast'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:80 CodeSize:40 Lat:80 SizeLat:80 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:10 SizeLat:10 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'broadcast'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 8 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 14 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 26 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 50 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
@@ -90,58 +90,58 @@ define void @broadcast() {
;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords, on neon)
define void @reverse() {
; CHECK-MVE-LABEL: 'reverse'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'reverse'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 18 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -179,40 +179,40 @@ define void @reverse() {
define void @concat() {
; CHECK-MVE-LABEL: 'concat'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'concat'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 12 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -241,54 +241,54 @@ define void @concat() {
define void @select() {
; CHECK-MVE-LABEL: 'select'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'select'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 6 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -324,40 +324,40 @@ define void @select() {
define void @vrev2() {
; CHECK-MVE-LABEL: 'vrev2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'vrev2'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 24 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 8 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -386,26 +386,26 @@ define void @vrev2() {
define void @vrev4() {
; CHECK-MVE-LABEL: 'vrev4'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-NEON-LABEL: 'vrev4'
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
diff --git a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
index df26121..924a629 100644
--- a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
; ASM lines below are only for reference; tests in that direction should go to tests/CodeGen/ARM
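; For reference, the updated checks in this file come from the new RUN line: with
; -cost-kind=all the cost printer reports all four cost kinds (RThru, CodeSize,
; Lat, SizeLat) and collapses them to a single number when they all agree. A
; sketch of regenerating the output by hand (assuming an in-tree build of opt):
;   opt < sub-cast-vect.ll -passes="print<cost-model>" -cost-kind=all -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9
; which prints lines such as:
;   Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8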
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'direct'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu1632'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T416, ptr %loadaddr
; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'ups3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'upu3264'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T232, ptr %loadaddr
; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
; COST-LABEL: 'dn3216'
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT: Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT: Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%v0 = load %T432, ptr %loadaddr
; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
deleted file mode 100644
index 2d6b318..0000000
--- a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-LAT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-define void @intrinsics() {
-; CHECK-THUMB2-RECIP-LABEL: 'intrinsics'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB2-LAT-LABEL: 'intrinsics'
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'intrinsics'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
- %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
- %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
- %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
- ret void
-}
-
-declare i32 @llvm.arm.ssat(i32, i32)
-declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
-declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
-declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
index 245e8f7..058370c 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
@@ -23,10 +23,10 @@
%"class.llvm::Metadata.306.1758.9986.10470.10954.11438.11922.12406.12890.13374.13858.15310.15794.16278.17730.19182.21118.25958.26926.29346.29830.30314.30798.31282.31766.32250.32734.33702.36606.38058.41638" = type { i8, i8, i16, i32 }
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(ptr nocapture) #0
+declare void @llvm.lifetime.end(ptr nocapture)
; Function Attrs: nounwind ssp uwtable
-define hidden void @fun(ptr %N, i1 %arg) #1 align 2 {
+define hidden void @fun(ptr %N, i1 %arg) align 2 {
; CHECK: define
entry:
%NumOperands.i = getelementptr inbounds %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642", ptr %N, i64 0, i32 8
@@ -47,8 +47,6 @@ for.body: ; preds = %for.body, %for.body
br i1 %exitcond193, label %for.cond.cleanup, label %for.body
}
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
index 0c0fb41..891d604 100644
--- a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
+++ b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
@@ -4,7 +4,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Function Attrs: noinline nounwind uwtable
-define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
; CHECK-LABEL: 'mat_mul'
; CHECK-NEXT: Inst: %tmp = load float, ptr %arrayidx, align 4
; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<%for.inc>
@@ -22,8 +22,8 @@ entry:
br label %entry.split
entry.split: ; preds = %entry
- %call = tail call i64 @_Z13get_global_idj(i32 0) #3
- %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3
+ %call = tail call i64 @_Z13get_global_idj(i32 0)
+ %call1 = tail call i64 @_Z13get_global_idj(i32 1)
%cmp1 = icmp sgt i64 %N, 0
%mul = mul nsw i64 %call, %N
br i1 %cmp1, label %for.inc.lr.ph, label %for.end
@@ -59,15 +59,10 @@ for.end: ; preds = %for.cond.for.end_cr
}
; Function Attrs: nounwind readnone
-declare i64 @_Z13get_global_idj(i32) #1
+declare i64 @_Z13get_global_idj(i32)
; Function Attrs: nounwind readnone speculatable
-declare float @llvm.fmuladd.f32(float, float, float) #2
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable }
-attributes #3 = { nounwind readnone }
+declare float @llvm.fmuladd.f32(float, float, float)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
index b498d7064..f5be89a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll
@@ -30,7 +30,7 @@ target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
%20 = type { [768 x i32] }
%21 = type { [416 x i32] }
-define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) #0 align 2 {
+define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) align 2 {
; CHECK-LABEL: 'test'
; CHECK-NEXT: Src: %v1 = load i32, ptr %B, align 4 --> Dst: %v1 = load i32, ptr %B, align 4
; CHECK-NEXT: da analyze - none!
@@ -91,5 +91,3 @@ bb38:
bb40:
ret void
}
-
-attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
index e5d5d21e..eba017a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
@@ -52,7 +52,7 @@ for.end:
@a = global [10004 x [10004 x i32]] zeroinitializer, align 16
; Function Attrs: nounwind uwtable
-define void @coupled_miv_type_mismatch(i32 %n) #0 {
+define void @coupled_miv_type_mismatch(i32 %n) {
; CHECK-LABEL: 'coupled_miv_type_mismatch'
; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx5, align 4 --> Dst: %2 = load i32, ptr %arrayidx5, align 4
; CHECK-NEXT: da analyze - none!
@@ -101,8 +101,6 @@ for.end13: ; preds = %for.cond
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 3.7.0"}
diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
new file mode 100644
index 0000000..64fad37
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 | FileCheck %s
+
+; for (i = 0; i < 3; i++) {
+; a[-k * i] = 1;
+; a[-k * i + (2 * k + 1)] = 2;
+; }
+;
+; When k = -1, a dependency exists between the two stores. The accesses are:
+;
+; - a[-k * i]               : a[ 0], a[ 1], a[ 2]
+; - a[-k * i + (2 * k + 1)] : a[-1], a[ 0], a[ 1]
+;
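+; The subscripts overlap when -k * i1 == -k * i2 + (2 * k + 1), i.e. when
+; k * (i2 - i1) == 2 * k + 1, which k = -1 satisfies with i2 - i1 = 1.
+;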
+; We cannot determine the sign of `k` and `2*k + 1` at compile time, so the
+; analysis must conservatively report an output dependence between the two
+; stores.
+;
+define void @unknown_sign(ptr %a, i64 %k) {
+; CHECK-LABEL: 'unknown_sign'
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1
+; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - output [<>]!
+; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %k.neg = sub nsw i64 0, %k
+ %kk = mul nsw i64 %k, 2
+ %subscript.1.init = add i64 1, %kk
+ br label %loop
+
+loop:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %subscript.0 = phi i64 [ 0, %entry ], [ %subscript.0.next, %loop ]
+ %subscript.1 = phi i64 [ %subscript.1.init, %entry ], [ %subscript.1.next, %loop ]
+ %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
+ %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1
+ store i8 1, ptr %idx.0
+ store i8 2, ptr %idx.1
+ %i.next = add i64 %i, 1
+ %subscript.0.next = add nsw i64 %subscript.0, %k.neg
+ %subscript.1.next = add nsw i64 %subscript.1, %k.neg
+ %cond.exit = icmp eq i64 %i.next, 3
+ br i1 %cond.exit, label %exit, label %loop
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
index c11191e..5470ef9 100644
--- a/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/invariant.group-bug.ll
@@ -17,13 +17,13 @@ target triple = "x86_64-grtev4-linux-gnu"
%4 = type { ptr }
%5 = type { i64, [8 x i8] }
-define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr #0 {
+define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1 %arg4) local_unnamed_addr {
; CHECK-LABEL: @fail(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[I4:%.*]] = load ptr, ptr [[ARG1:%.*]], align 8, !invariant.group [[META6:![0-9]+]]
; CHECK-NEXT: [[I5:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 6
; CHECK-NEXT: [[I6:%.*]] = load ptr, ptr [[I5]], align 8, !invariant.load [[META6]]
-; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[I7:%.*]] = tail call i64 [[I6]](ptr [[ARG1]])
; CHECK-NEXT: [[I9:%.*]] = load ptr, ptr [[ARG2:%.*]], align 8
; CHECK-NEXT: store i8 0, ptr [[I9]], align 1
; CHECK-NEXT: br i1 [[ARG4:%.*]], label [[BB10:%.*]], label [[BB29:%.*]]
@@ -32,7 +32,7 @@ define void @fail(ptr noalias sret(i1) %arg, ptr %arg1, ptr %arg2, ptr %arg3, i1
; CHECK-NEXT: [[I15_PRE:%.*]] = load ptr, ptr [[I14_PHI_TRANS_INSERT]], align 8, !invariant.load [[META6]]
; CHECK-NEXT: br label [[BB12:%.*]]
; CHECK: bb12:
-; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0) #[[ATTR1]]
+; CHECK-NEXT: [[I16:%.*]] = call i64 [[I15_PRE]](ptr nonnull [[ARG1]], ptr null, i64 0)
; CHECK-NEXT: br i1 true, label [[BB28:%.*]], label [[BB17:%.*]]
; CHECK: bb17:
; CHECK-NEXT: br i1 true, label [[BB18:%.*]], label [[BB21:%.*]]
@@ -55,7 +55,7 @@ bb:
%i4 = load ptr, ptr %arg1, align 8, !invariant.group !6
%i5 = getelementptr inbounds ptr, ptr %i4, i64 6
%i6 = load ptr, ptr %i5, align 8, !invariant.load !6
- %i7 = tail call i64 %i6(ptr %arg1) #1
+ %i7 = tail call i64 %i6(ptr %arg1)
%i9 = load ptr, ptr %arg2, align 8
store i8 0, ptr %i9, align 1
br i1 %arg4, label %bb10, label %bb29
@@ -67,7 +67,7 @@ bb12: ; preds = %bb28, %bb10
%i13 = load ptr, ptr %arg1, align 8, !invariant.group !6
%i14 = getelementptr inbounds ptr, ptr %i13, i64 22
%i15 = load ptr, ptr %i14, align 8, !invariant.load !6
- %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0) #1
+ %i16 = call i64 %i15(ptr nonnull %arg1, ptr null, i64 0)
br i1 %arg4, label %bb28, label %bb17
bb17: ; preds = %bb12
@@ -110,9 +110,6 @@ bb29: ; preds = %bb28, %bb
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.linker.options = !{}
!llvm.module.flags = !{!0, !1, !3, !4, !5}
diff --git a/llvm/test/Analysis/MemorySSA/pr28880.ll b/llvm/test/Analysis/MemorySSA/pr28880.ll
index 98f3261..a2690b9 100644
--- a/llvm/test/Analysis/MemorySSA/pr28880.ll
+++ b/llvm/test/Analysis/MemorySSA/pr28880.ll
@@ -8,7 +8,7 @@
@global.1 = external hidden unnamed_addr global double, align 8
; Function Attrs: nounwind ssp uwtable
-define hidden fastcc void @hoge(i1 %arg) unnamed_addr #0 {
+define hidden fastcc void @hoge(i1 %arg) unnamed_addr {
bb:
br i1 %arg, label %bb1, label %bb2
@@ -45,6 +45,3 @@ bb4: ; preds = %bb3
bb6: ; preds = %bb3
unreachable
}
-
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll
index af57b3c..6be0c58 100644
--- a/llvm/test/Analysis/MemorySSA/pr39197.ll
+++ b/llvm/test/Analysis/MemorySSA/pr39197.ll
@@ -12,13 +12,13 @@ declare void @dummy()
; CHECK-LABEL: @main()
; Function Attrs: nounwind
-define dso_local void @main() #0 {
+define dso_local void @main() {
call void @func_1()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
%1 = alloca ptr, align 8
%2 = call signext i32 @func_2()
%3 = icmp ne i32 %2, 0
@@ -64,45 +64,45 @@ define dso_local void @func_1() #0 {
}
; Function Attrs: nounwind
-declare dso_local signext i32 @func_2() #0
+declare dso_local signext i32 @func_2()
; Function Attrs: nounwind
-define dso_local void @safe_sub_func_uint8_t_u_u() #0 {
+define dso_local void @safe_sub_func_uint8_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_int64_t_s_s() #0 {
+define dso_local void @safe_add_func_int64_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_rshift_func_int16_t_s_u() #0 {
+define dso_local void @safe_rshift_func_int16_t_s_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_uint8_t_u_u() #0 {
+define dso_local void @safe_div_func_uint8_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_mul_func_uint16_t_u_u() #0 {
+define dso_local void @safe_mul_func_uint16_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_mul_func_int16_t_s_s() #0 {
+define dso_local void @safe_mul_func_int16_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_int32_t_s_s() #0 {
+define dso_local void @safe_div_func_int32_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
+define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) {
%2 = alloca i16, align 2
store i16 %0, ptr %2, align 2, !tbaa !1
%3 = load i16, ptr %2, align 2, !tbaa !1
@@ -113,29 +113,25 @@ define dso_local signext i16 @safe_sub_func_int16_t_s_s(i16 signext) #0 {
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint16_t_u_u() #0 {
+define dso_local void @safe_add_func_uint16_t_u_u() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_div_func_int8_t_s_s() #0 {
+define dso_local void @safe_div_func_int8_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_int16_t_s_s() #0 {
+define dso_local void @safe_add_func_int16_t_s_s() {
ret void
}
; Function Attrs: nounwind
-define dso_local void @safe_add_func_uint8_t_u_u() #0 {
+define dso_local void @safe_add_func_uint8_t_u_u() {
ret void
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 8.0.0 (http://llvm.org/git/clang.git 7cda4756fc9713d98fd3513b8df172700f267bad) (http://llvm.org/git/llvm.git 199c0d32e96b646bd8cf6beeaf0f99f8a434b56a)"}
diff --git a/llvm/test/Analysis/MemorySSA/pr40038.ll b/llvm/test/Analysis/MemorySSA/pr40038.ll
index efdcbe5..39ea78b 100644
--- a/llvm/test/Analysis/MemorySSA/pr40038.ll
+++ b/llvm/test/Analysis/MemorySSA/pr40038.ll
@@ -10,21 +10,21 @@ target triple = "s390x-ibm-linux"
; Function Attrs: nounwind
; CHECK-LABEL: @main
-define dso_local void @main() #0 {
+define dso_local void @main() {
bb:
call void @func_1()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_1() #0 {
+define dso_local void @func_1() {
bb:
call void @func_2()
unreachable
}
; Function Attrs: nounwind
-define dso_local void @func_2() #0 {
+define dso_local void @func_2() {
bb:
%tmp = alloca i32, align 4
store i32 0, ptr @g_80, align 4, !tbaa !1
@@ -68,10 +68,7 @@ bb18: ; preds = %bb12, %bb1
}
; Function Attrs: cold noreturn nounwind
-declare void @llvm.trap() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { cold noreturn nounwind }
+declare void @llvm.trap()
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/MemorySSA/pr43569.ll b/llvm/test/Analysis/MemorySSA/pr43569.ll
index 02d074e..c81f8d4 100644
--- a/llvm/test/Analysis/MemorySSA/pr43569.ll
+++ b/llvm/test/Analysis/MemorySSA/pr43569.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: @c()
; Function Attrs: nounwind uwtable
-define dso_local void @c() #0 {
+define dso_local void @c() {
entry:
call void @llvm.instrprof.increment(ptr @__profn_c, i64 68269137, i32 3, i32 0)
br label %for.cond
@@ -42,8 +42,4 @@ for.end: ; preds = %for.cond1
}
; Function Attrs: nounwind
-declare void @llvm.instrprof.increment(ptr, i64, i32, i32) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
diff --git a/llvm/test/Analysis/ScalarEvolution/pr22674.ll b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
index 95f96ca..b2f4ae6 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr22674.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr22674.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-pc-linux-gnux32"
%"class.llvm::AttributeImpl.2.1802.3601.5914.6685.7456.8227.9255.9769.10026.18508" = type <{ ptr, %"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505", i8, [3 x i8] }>
; Function Attrs: nounwind uwtable
-define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) #0 align 2 {
+define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy(i1 %arg) align 2 {
entry:
br i1 %arg, label %cond.false, label %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit
@@ -82,8 +82,6 @@ return: ; preds = %_ZNK4llvm9Attribute
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
index 3879b2e7..d9cc3e5 100644
--- a/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
+++ b/llvm/test/Analysis/ScalarEvolution/scev-canonical-mode.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
-define void @ehF(i1 %arg) #0 {
+define void @ehF(i1 %arg) {
entry:
br i1 %arg, label %if.then.i, label %hup.exit
@@ -28,5 +28,3 @@ for.body.i: ; preds = %for.body.i, %for.bo
hup.exit: ; preds = %for.body.i, %if.then.i, %entry
ret void
}
-
-attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
index 67d81e7..7fb4231 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%classD = type { ptr }
; Function Attrs: ssp uwtable
-define ptr @test(ptr %this, ptr %p1) #0 align 2 {
+define ptr @test(ptr %this, ptr %p1) align 2 {
entry:
; CHECK-LABEL: @test
; CHECK: load ptr, ptr %p1, align 8, !tbaa
@@ -25,10 +25,7 @@ entry:
unreachable
}
-declare void @callee(ptr, ptr) #1
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @callee(ptr, ptr)
!llvm.ident = !{!0}
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index 942fdf5..f9a2988 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
@@ -9,7 +9,7 @@
%struct.StructC = type { i16, %struct.StructB, i32 }
%struct.StructD = type { i16, %struct.StructB, i32, i8 }
-define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z1gPjP7StructAy(ptr %s, ptr %A, i64 %count) {
entry:
; Access to ptr and &(A->f32).
; CHECK: Function
@@ -35,7 +35,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) #0 {
+define i32 @_Z2g2PjP7StructAy(ptr %s, ptr %A, i64 %count) {
entry:
; Access to ptr and &(A->f16).
; CHECK: Function
@@ -60,7 +60,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g3P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f32).
; CHECK: Function
@@ -89,7 +89,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g4P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f16).
; CHECK: Function
@@ -117,7 +117,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g5P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->f32).
; CHECK: Function
@@ -145,7 +145,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) #0 {
+define i32 @_Z2g6P7StructAP7StructBy(ptr %A, ptr %B, i64 %count) {
entry:
; Access to &(A->f32) and &(B->a.f32_2).
; CHECK: Function
@@ -174,7 +174,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g7P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
entry:
; Access to &(A->f32) and &(S->f32).
; CHECK: Function
@@ -202,7 +202,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) #0 {
+define i32 @_Z2g8P7StructAP7StructSy(ptr %A, ptr %S, i64 %count) {
entry:
; Access to &(A->f32) and &(S->f16).
; CHECK: Function
@@ -229,7 +229,7 @@ entry:
ret i32 %3
}
-define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z2g9P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
entry:
; Access to &(S->f32) and &(S2->f32).
; CHECK: Function
@@ -257,7 +257,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) #0 {
+define i32 @_Z3g10P7StructSP8StructS2y(ptr %S, ptr %S2, i64 %count) {
entry:
; Access to &(S->f32) and &(S2->f16).
; CHECK: Function
@@ -284,7 +284,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g11P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
entry:
; Access to &(C->b.a.f32) and &(D->b.a.f32).
; CHECK: Function
@@ -318,7 +318,7 @@ entry:
ret i32 %3
}
-define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) #0 {
+define i32 @_Z3g12P7StructCP7StructDy(ptr %C, ptr %D, i64 %count) {
entry:
; Access to &(b1->a.f32) and &(b2->a.f32).
; CHECK: Function
@@ -357,8 +357,6 @@ entry:
ret i32 %5
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!1, !1, i64 0}
!1 = !{!"any pointer", !2}
!2 = !{!"omnipotent char", !3}
diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index 39f33f9f..6609edc 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -52,6 +52,25 @@ define void @f(ptr %x) {
; CHECK: atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
+  ; CHECK: load atomic <1 x i32>, ptr %x unordered, align 4
+  load atomic <1 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <1 x i32> splat (i32 3), ptr %x release, align 4
+  store atomic <1 x i32> <i32 3>, ptr %x release, align 4
+  ; CHECK: load atomic <2 x i32>, ptr %x unordered, align 4
+  load atomic <2 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+  store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+
+  ; CHECK: load atomic <2 x ptr>, ptr %x unordered, align 4
+  load atomic <2 x ptr>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+  store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+
+  ; CHECK: load atomic <2 x float>, ptr %x unordered, align 4
+  load atomic <2 x float>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x float> <float 3.000000e+00, float 4.000000e+00>, ptr %x release, align 4
+  store atomic <2 x float> <float 3.0, float 4.0>, ptr %x release, align 4
+
; CHECK: fence syncscope("singlethread") release
fence syncscope("singlethread") release
; CHECK: fence seq_cst
diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll
new file mode 100644
index 0000000..4fd4713
--- /dev/null
+++ b/llvm/test/Assembler/metadata-annotations.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s
+
+; CHECK: ; Materializable
+; CHECK-NEXT: define dso_local i32 @test() {}
+define dso_local i32 @test() {
+entry:
+ ret i32 0
+}
+
diff --git a/llvm/test/Bitcode/DILocation-implicit-code.ll b/llvm/test/Bitcode/DILocation-implicit-code.ll
index 159cb8a..1090bff 100644
--- a/llvm/test/Bitcode/DILocation-implicit-code.ll
+++ b/llvm/test/Bitcode/DILocation-implicit-code.ll
@@ -13,7 +13,7 @@ $_ZN1A3fooEi = comdat any
@_ZTIi = external dso_local constant i8*
; Function Attrs: noinline optnone uwtable
-define dso_local void @_Z5test1v() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !7 {
+define dso_local void @_Z5test1v() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !7 {
entry:
%retval = alloca %struct.A, align 1
%a = alloca %struct.A, align 1
@@ -40,13 +40,13 @@ lpad: ; preds = %entry
catch.dispatch: ; preds = %lpad
%sel = load i32, i32* %ehselector.slot, align 4, !dbg !13
- %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #4, !dbg !13
+ %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)), !dbg !13
%matches = icmp eq i32 %sel, %3, !dbg !13
br i1 %matches, label %catch, label %eh.resume, !dbg !13
catch: ; preds = %catch.dispatch
%exn = load i8*, i8** %exn.slot, align 8, !dbg !13
- %4 = call i8* @__cxa_begin_catch(i8* %exn) #4, !dbg !13
+ %4 = call i8* @__cxa_begin_catch(i8* %exn), !dbg !13
%5 = bitcast i8* %4 to i32*, !dbg !13
%6 = load i32, i32* %5, align 4, !dbg !13
store i32 %6, i32* %e, align 4, !dbg !13
@@ -55,7 +55,7 @@ catch: ; preds = %catch.dispatch
to label %invoke.cont2 unwind label %lpad1, !dbg !15
invoke.cont2: ; preds = %catch
- call void @__cxa_end_catch() #4, !dbg !16
+ call void @__cxa_end_catch(), !dbg !16
br label %return
lpad1: ; preds = %catch
@@ -65,7 +65,7 @@ lpad1: ; preds = %catch
store i8* %9, i8** %exn.slot, align 8, !dbg !12
%10 = extractvalue { i8*, i32 } %8, 1, !dbg !12
store i32 %10, i32* %ehselector.slot, align 4, !dbg !12
- call void @__cxa_end_catch() #4, !dbg !16
+ call void @__cxa_end_catch(), !dbg !16
br label %eh.resume, !dbg !16
try.cont: ; No predecessors!
@@ -84,7 +84,7 @@ eh.resume: ; preds = %lpad1, %catch.dispa
}
; Function Attrs: noinline nounwind optnone uwtable
-define linkonce_odr dso_local void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr #1 comdat align 2 !dbg !17 {
+define linkonce_odr dso_local void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr comdat align 2 !dbg !17 {
entry:
%this.addr = alloca %struct.A*, align 8
store %struct.A* %this, %struct.A** %this.addr, align 8
@@ -93,7 +93,7 @@ entry:
}
; Function Attrs: noinline optnone uwtable
-define linkonce_odr dso_local void @_ZN1A3fooEi(%struct.A* %this, i32 %i) #0 comdat align 2 !dbg !19 {
+define linkonce_odr dso_local void @_ZN1A3fooEi(%struct.A* %this, i32 %i) comdat align 2 !dbg !19 {
entry:
%retval = alloca %struct.A, align 1
%this.addr = alloca %struct.A*, align 8
@@ -106,10 +106,10 @@ entry:
br i1 %cmp, label %if.then, label %if.end, !dbg !20
if.then: ; preds = %entry
- %exception = call i8* @__cxa_allocate_exception(i64 4) #4, !dbg !22
+ %exception = call i8* @__cxa_allocate_exception(i64 4), !dbg !22
%1 = bitcast i8* %exception to i32*, !dbg !22
store i32 1, i32* %1, align 16, !dbg !22
- call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) #5, !dbg !22
+ call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null), !dbg !22
unreachable, !dbg !22
if.end: ; preds = %entry
@@ -119,17 +119,17 @@ if.end: ; preds = %entry
declare dso_local i32 @__gxx_personality_v0(...)
; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.typeid.for(i8*) #2
+declare i32 @llvm.eh.typeid.for(i8*)
declare dso_local i8* @__cxa_begin_catch(i8*)
declare dso_local void @__cxa_end_catch()
; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()
; Function Attrs: noinline optnone uwtable
-define dso_local void @_Z5test2v() #0 !dbg !24 {
+define dso_local void @_Z5test2v() !dbg !24 {
entry:
%a = alloca %struct.A, align 1
%b = alloca %struct.A, align 1
@@ -143,13 +143,6 @@ declare dso_local i8* @__cxa_allocate_exception(i64)
declare dso_local void @__cxa_throw(i8*, i8*, i8*)
-attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Bitcode/drop-debug-info.3.5.ll b/llvm/test/Bitcode/drop-debug-info.3.5.ll
index 35d3958..16102a0 100644
--- a/llvm/test/Bitcode/drop-debug-info.3.5.ll
+++ b/llvm/test/Bitcode/drop-debug-info.3.5.ll
@@ -12,15 +12,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @main() #0 {
+define i32 @main() {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
ret i32 0, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
deleted file mode 100644
index 00c0131..0000000
--- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: opt -module-summary %s -o %t.o
-
-; Ensure dead stripping performed flag is set on distributed index
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD
-; WITHDEAD: <FLAGS op0=97/>
-
-; Ensure dead stripping performed flag is not set on distributed index
-; when option used to disable dead stripping computation.
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx -compute-dead=false
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
-; NODEAD: <FLAGS op0=96/>
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@glob = global i32 0
diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll
new file mode 100644
index 0000000..e957ce6
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-index-flags.ll
@@ -0,0 +1,39 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+;; By default, the indexing step should perform dead stripping, attribute
+;; propagation, DSO local propagation, and internalization/promotion, and
+;; set the corresponding index flags.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL
+;; The flag value should be 0x461 aka 1121:
+;; 0x1: Dead stripping
+;; 0x20: Attribute propagation
+;; 0x40: DSO local propagation
+;; 0x400: Internalization/promotion
+; ALL: <FLAGS op0=1121/>
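+;; (0x1 | 0x20 | 0x40 | 0x400 = 0x461 = 1121.)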
+
+;; Ensure the dead stripping performed flag is not set on the distributed
+;; index when the option to disable dead stripping computation is used.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
+;; Flag should be 0x460 aka 1120.
+; NODEAD: <FLAGS op0=1120/>
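+;; (0x461 with the dead stripping bit cleared: 0x461 & ~0x1 = 0x460 = 1120.)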
+
+;; Disabling attribute propagation should disable that as well as DSO local
+;; propagation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -propagate-attrs=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP
+;; Flag should be 0x401 aka 1025.
+; NOPROP: <FLAGS op0=1025/>
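+;; (0x461 with both propagation bits cleared: 0x461 & ~(0x20 | 0x40) = 0x401 = 1025.)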
+
+;; Note there isn't currently a way to disable internalization+promotion, which
+;; are performed together.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
diff --git a/llvm/test/Bitcode/upgrade-tbaa.ll b/llvm/test/Bitcode/upgrade-tbaa.ll
index 7a70d99..893ba69 100644
--- a/llvm/test/Bitcode/upgrade-tbaa.ll
+++ b/llvm/test/Bitcode/upgrade-tbaa.ll
@@ -2,7 +2,7 @@
; RUN: verify-uselistorder < %s
; Function Attrs: nounwind
-define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) #0 {
+define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) {
entry:
store i32 0, i32* %pI, align 4, !tbaa !{!"int", !0}
; CHECK: store i32 0, ptr %pI, align 4, !tbaa [[TAG_INT:!.*]]
@@ -11,8 +11,6 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!"omnipotent char", !1}
!1 = !{!"Simple C/C++ TBAA"}
!2 = !{!"float", !0}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 0f75887..c68ae92 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1734,8 +1734,7 @@ define float @test_different_call_conv_target(float %x) {
define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0, 0)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ARG]](s32), [[ARG]](s32)
; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%res = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
@@ -1745,7 +1744,8 @@ define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
define i32 @test_shufflevector_v2s32_s32(<2 x i32> %arg) {
; CHECK-LABEL: name: test_shufflevector_v2s32_s32
; CHECK: [[ARG:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], shufflemask(1)
+; CHECK: [[IDX:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ARG]](<2 x s32>), [[IDX]](s64)
; CHECK: $w0 = COPY [[RES]](s32)
%vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <1 x i32> <i32 1>
%res = extractelement <1 x i32> %vec, i32 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2e70252..fdd0ebb 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -119,33 +119,6 @@ body: |
...
---
-name: shuffle_1elt_mask
-alignment: 4
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $d0, $d1
-
- ; CHECK-LABEL: name: shuffle_1elt_mask
- ; CHECK: liveins: $d0, $d1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
- ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
- ; CHECK-NEXT: $d1 = COPY [[COPY3]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
- %0:_(s64) = COPY $d0
- %1:_(s64) = COPY $d1
- %3:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(0)
- %4:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(1)
- $d0 = COPY %3(s64)
- $d1 = COPY %4(s64)
- RET_ReallyLR implicit $d0, implicit $d1
-
-...
----
name: oversize_shuffle_v4i64
alignment: 4
tracksRegLiveness: true
@@ -641,7 +614,8 @@ body: |
%0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
%3:_(<8 x s8>) = COPY $d1
%1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
- %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+ %7:_(s64) = G_CONSTANT i64 4
+ %4:_(s1) = G_EXTRACT_VECTOR_ELT %1:_(<8 x s1>), %7(s64)
%5:_(s8) = G_ZEXT %4:_(s1)
%6:_(s32) = G_ANYEXT %5:_(s8)
$w0 = COPY %6:_(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
index 9261d7a..914dfa0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
@@ -80,22 +80,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1,0,1,-1)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_scalar
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY]](s64), [[COPY]](s64), [[COPY]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %2:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 0, 0, 0)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
index 9bf7993..7a314ba 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
@@ -20,23 +20,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1(<2 x s32>), shufflemask(0, 2, 1, 3)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_undef_rhs_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_undef_rhs_scalar
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = G_IMPLICIT_DEF
- %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(s64), %1(s64), shufflemask(0, 1)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
index 2c9ae5b..9013410 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
@@ -386,7 +386,7 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<4 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,1,0,1)
+ %6:_(<4 x p0>) = G_BUILD_VECTOR %0, %1, %0, %1
RET_ReallyLR implicit %6
...
@@ -408,104 +408,6 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<2 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1,0)
- RET_ReallyLR implicit %6
-...
-
-# Check that we properly use undef values when shuffle_vector
-# on scalars gets lowered to build_vector.
----
-name: shuffle_vector_on_scalars_to_build_vector_with_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_build_vector_with_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64), [[DEF]](s64), [[COPY1]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %6:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,-1,-1,1)
- RET_ReallyLR implicit %6
-...
-
-# Check that shuffle_vector on scalars gets combined into a plain
-# copy when the resulting type is a scalar as well and the sizes
-# are compatible.
----
-name: shuffle_vector_on_scalars_to_copy_ptr
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_copy_ptr
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
- ; CHECK-NEXT: RET_ReallyLR implicit [[COPY]](p0)
- %0:_(p0) = COPY $x0
- %6:_(p0) = G_SHUFFLE_VECTOR %0, %0, shufflemask(0)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_lhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_lhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_rhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_rhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: RET_ReallyLR implicit [[DEF]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1)
+ %6:_(<2 x p0>) = G_BUILD_VECTOR %1, %0
RET_ReallyLR implicit %6
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
index b252884..46dbc15 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
@@ -96,7 +96,7 @@ body: |
; CHECK-NEXT: [[SITOFP:%[0-9]+]]:fpr(s32) = G_SITOFP [[COPY1]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[SELECT:%[0-9]+]]:fpr(s32) = G_SELECT [[COPY2]](s32), [[COPY3]], [[SITOFP]]
- ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:gpr(s32) = G_FPTOSI [[SELECT]](s32)
+ ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:fpr(s32) = G_FPTOSI [[SELECT]](s32)
%0:_(s32) = COPY $w0
%2:_(s32) = COPY $w1
%3:_(s32) = COPY $w2
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0933e67..b54f262 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
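+
+; Reduction of the form sum += (i64)zext(A[i]) * (i64)sext(B), vectorized
+; with a 16-wide main loop, a 4-wide vectorized epilogue and a scalar tail.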
+define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: cbz w2, .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %iter.check
+; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NEXT: .cfi_offset w23, -40
+; CHECK-SD-NEXT: .cfi_offset w24, -48
+; CHECK-SD-NEXT: .cfi_offset w25, -64
+; CHECK-SD-NEXT: sxtb x9, w1
+; CHECK-SD-NEXT: cmp w2, #3
+; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: b.hi .LBB6_4
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_13
+; CHECK-SD-NEXT: .LBB6_3:
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
+; CHECK-SD-NEXT: dup v0.2d, x9
+; CHECK-SD-NEXT: cmp w2, #16
+; CHECK-SD-NEXT: b.hs .LBB6_6
+; CHECK-SD-NEXT: // %bb.5:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_10
+; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x15, x0
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: .LBB6_7: // %vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr q17, [x15], #16
+; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
+; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
+; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
+; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
+; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
+; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
+; CHECK-SD-NEXT: fmov x17, d21
+; CHECK-SD-NEXT: mov x2, v21.d[1]
+; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
+; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
+; CHECK-SD-NEXT: fmov x18, d22
+; CHECK-SD-NEXT: fmov x1, d17
+; CHECK-SD-NEXT: fmov x3, d23
+; CHECK-SD-NEXT: fmov x21, d20
+; CHECK-SD-NEXT: fmov x22, d18
+; CHECK-SD-NEXT: fmov x19, d21
+; CHECK-SD-NEXT: mul x17, x13, x17
+; CHECK-SD-NEXT: mov x4, v22.d[1]
+; CHECK-SD-NEXT: fmov x24, d19
+; CHECK-SD-NEXT: mov x5, v23.d[1]
+; CHECK-SD-NEXT: mov x6, v21.d[1]
+; CHECK-SD-NEXT: mov x7, v20.d[1]
+; CHECK-SD-NEXT: mov x20, v18.d[1]
+; CHECK-SD-NEXT: mov x23, v19.d[1]
+; CHECK-SD-NEXT: mov x25, v17.d[1]
+; CHECK-SD-NEXT: mul x18, x14, x18
+; CHECK-SD-NEXT: mul x1, x13, x1
+; CHECK-SD-NEXT: fmov d17, x17
+; CHECK-SD-NEXT: mul x3, x13, x3
+; CHECK-SD-NEXT: fmov d18, x18
+; CHECK-SD-NEXT: mul x19, x13, x19
+; CHECK-SD-NEXT: fmov d19, x1
+; CHECK-SD-NEXT: mul x21, x13, x21
+; CHECK-SD-NEXT: fmov d20, x3
+; CHECK-SD-NEXT: mul x22, x13, x22
+; CHECK-SD-NEXT: fmov d21, x19
+; CHECK-SD-NEXT: mul x24, x13, x24
+; CHECK-SD-NEXT: fmov d24, x21
+; CHECK-SD-NEXT: mul x2, x8, x2
+; CHECK-SD-NEXT: fmov d22, x22
+; CHECK-SD-NEXT: mul x4, x8, x4
+; CHECK-SD-NEXT: fmov d23, x24
+; CHECK-SD-NEXT: mul x5, x8, x5
+; CHECK-SD-NEXT: mov v17.d[1], x2
+; CHECK-SD-NEXT: mul x6, x8, x6
+; CHECK-SD-NEXT: mov v18.d[1], x4
+; CHECK-SD-NEXT: mul x7, x8, x7
+; CHECK-SD-NEXT: mov v20.d[1], x5
+; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
+; CHECK-SD-NEXT: mul x20, x8, x20
+; CHECK-SD-NEXT: mov v21.d[1], x6
+; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
+; CHECK-SD-NEXT: mul x23, x8, x23
+; CHECK-SD-NEXT: mov v24.d[1], x7
+; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
+; CHECK-SD-NEXT: mul x17, x8, x25
+; CHECK-SD-NEXT: mov v22.d[1], x20
+; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
+; CHECK-SD-NEXT: mov v23.d[1], x23
+; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
+; CHECK-SD-NEXT: mov v19.d[1], x17
+; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
+; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
+; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_7
+; CHECK-SD-NEXT: // %bb.8: // %middle.block
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
+; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: addp d1, v1.2d
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
+; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
+; CHECK-SD-NEXT: fmov x15, d0
+; CHECK-SD-NEXT: sub x12, x13, x11
+; CHECK-SD-NEXT: add x13, x0, x13
+; CHECK-SD-NEXT: mov v1.d[0], x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr s0, [x13], #4
+; CHECK-SD-NEXT: adds x12, x12, #4
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: fmov x16, d4
+; CHECK-SD-NEXT: fmov x18, d0
+; CHECK-SD-NEXT: mov x17, v4.d[1]
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: mul x16, x14, x16
+; CHECK-SD-NEXT: mul x18, x15, x18
+; CHECK-SD-NEXT: mul x17, x8, x17
+; CHECK-SD-NEXT: fmov d0, x16
+; CHECK-SD-NEXT: mul x1, x8, x1
+; CHECK-SD-NEXT: fmov d4, x18
+; CHECK-SD-NEXT: mov v0.d[1], x17
+; CHECK-SD-NEXT: mov v4.d[1], x1
+; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_11
+; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
+; CHECK-SD-NEXT: sub x10, x10, x11
+; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: .LBB6_14: // %for.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldrb w12, [x11], #1
+; CHECK-SD-NEXT: subs x10, x10, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: b.ne .LBB6_14
+; CHECK-SD-NEXT: .LBB6_15:
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: cbz w2, .LBB6_7
+; CHECK-GI-NEXT: // %bb.1: // %iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #4
+; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: b.lo .LBB6_12
+; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: dup v1.2d, x9
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #16
+; CHECK-GI-NEXT: b.lo .LBB6_9
+; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: xtn v2.2s, v1.2d
+; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: .LBB6_4: // %vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr q18, [x12], #16
+; CHECK-GI-NEXT: subs x13, x13, #16
+; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-GI-NEXT: mov d22, v20.d[1]
+; CHECK-GI-NEXT: mov d23, v19.d[1]
+; CHECK-GI-NEXT: mov d24, v21.d[1]
+; CHECK-GI-NEXT: mov d25, v18.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
+; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
+; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
+; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
+; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
+; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: b.ne .LBB6_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
+; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: b.ne .LBB6_8
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_7:
+; CHECK-GI-NEXT: mov x8, xzr
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
+; CHECK-GI-NEXT: cbz x8, .LBB6_12
+; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x11
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
+; CHECK-GI-NEXT: sub x8, x12, x11
+; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: adds x8, x8, #4
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: mov b4, v3.b[2]
+; CHECK-GI-NEXT: mov b5, v3.b[1]
+; CHECK-GI-NEXT: mov b6, v3.b[3]
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: fmov w14, s4
+; CHECK-GI-NEXT: fmov w15, s5
+; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: uxtb w14, w14
+; CHECK-GI-NEXT: uxtb w15, w15
+; CHECK-GI-NEXT: uxtb w16, w16
+; CHECK-GI-NEXT: fmov s4, w14
+; CHECK-GI-NEXT: mov v3.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[1], w16
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: b.ne .LBB6_10
+; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: b.eq .LBB6_14
+; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
+; CHECK-GI-NEXT: sub x10, x10, x11
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: .LBB6_13: // %for.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: subs x10, x10, #1
+; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: b.ne .LBB6_13
+; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %cmp5.not = icmp eq i32 %n, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check
+
+iter.check: ; preds = %entry
+ %conv1 = sext i8 %B to i64
+ %wide.trip.count = zext i32 %n to i64
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check: ; preds = %iter.check
+ %min.iters.check9 = icmp ult i32 %n, 16
+ br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph: ; preds = %vector.main.loop.iter.check
+ %n.mod.vf = and i64 %wide.trip.count, 12
+ %n.vec = and i64 %wide.trip.count, 4294967280
+ %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %1 = zext <16 x i8> %wide.load to <16 x i64>
+ %2 = mul nsw <16 x i64> %broadcast.splat, %1
+ %3 = add <16 x i64> %2, %vec.phi
+ %index.next = add nuw i64 %index, 16
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3)
+ %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+ br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check: ; preds = %middle.block
+ %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0
+ br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec11 = and i64 %wide.trip.count, 4294967292
+ %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0
+ %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+ %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ]
+ %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ]
+ %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14
+ %wide.load16 = load <4 x i8>, ptr %7, align 1
+ %8 = zext <4 x i8> %wide.load16 to <4 x i64>
+ %9 = mul nsw <4 x i64> %broadcast.splat13, %8
+ %10 = add <4 x i64> %9, %vec.phi15
+ %index.next17 = add nuw i64 %index14, 4
+ %11 = icmp eq i64 %index.next17, %n.vec11
+ br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
+ %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10)
+ %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count
+ br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+ %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ]
+ %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ]
+ ret i64 %s.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+ %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv
+ %13 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %13 to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %add = add nsw i64 %mul, %s.06
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB6_1: // %loop
+; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB6_1
+; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB6_1: // %loop
+; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB6_1
+; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB7_1: // %loop
+; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB7_1
+; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB7_1: // %loop
+; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB7_1
+; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB8_1: // %loop
+; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB8_1
+; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB8_1: // %loop
+; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB8_1
+; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB9_1: // %loop
+; CHECK-SD-NEXT: .LBB10_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB9_1
+; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB9_1: // %loop
+; CHECK-GI-NEXT: .LBB10_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB9_1
+; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
-; CHECK-SD-NEXT: .LBB10_1: // %vector.body
+; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB10_1
+; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB10_1: // %vector.body
+; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB10_1
+; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
-; CHECK-SD-NEXT: .LBB11_1: // %vector.body
+; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB11_1: // %vector.body
+; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
; CHECK-GI-NEXT: stp q1, q3, [x9]
; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB12_1: // %vector.body
+; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB12_1
+; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
-; CHECK-GI-NEXT: .LBB12_1: // %vector.body
+; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB12_1
+; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB13_1: // %vector.body
+; CHECK-SD-NEXT: .LBB14_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB13_1
+; CHECK-SD-NEXT: b.ne .LBB14_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
-; CHECK-GI-NEXT: .LBB13_1: // %vector.body
+; CHECK-GI-NEXT: .LBB14_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB13_1
+; CHECK-GI-NEXT: b.ne .LBB14_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: lsr x9, x0, #16
-; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: adrp x8, .LCPI15_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0]
; CHECK-SD-NEXT: dup v2.8h, w9
; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
@@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr w9, w0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
; CHECK-GI-NEXT: dup v1.8h, w9
; CHECK-GI-NEXT: fmov s3, w9
; CHECK-GI-NEXT: sqneg v2.8h, v1.8h
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: dup v3.8h, w0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..0cd885e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
new file mode 100644
index 0000000..a729772
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
@@ -0,0 +1,1943 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFPRCVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for fptosi_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f32_simd
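+; With -global-isel-abort=2, GlobalISel falls back to SelectionDAG for the
+; constrained (strictfp) conversions and prints the warnings checked above.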
+
+;
+; FPTOI
+;
+
+define float @test_fptosi_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptosi_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptosi_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptosi_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define float @test_fptoui_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptoui_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptoui_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptoui_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+
+;
+; FPTOI strictfp
+;
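+; The constrained intrinsics below request strict FP exception semantics
+; (fpexcept.strict); they should still select the same single fcvtzs/fcvtzu
+; conversions as the default-environment tests above.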
+
+define float @fptosi_i32_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptosi_i64_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptosi_i64_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptosi_i64_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+
+
+define float @fptoui_i32_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptoui_i64_f16_simd(half %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict") strictfp
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptoui_i64_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptoui_i64_f64_simd(double %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f32_simd(float %x) strictfp {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") strictfp
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+;
+; FPTOI rounding
+;
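+; fptosi/fptoui of llvm.round/floor/ceil/trunc should fold with the rounding
+; step into a single fcvta*/fcvtm*/fcvtp*/fcvtz* conversion.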
+
+
+define double @fcvtas_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtau_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtms_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+
+define double @fcvtmu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtps_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtpu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtzs_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+;
+; FPTOI saturating
+;
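+; The fcvtz[su] instructions saturate on overflow by definition, so a single
+; SIMD-scalar convert implements llvm.fpto[su]i.sat whenever the result stays
+; in a floating-point register; for the cross-size cases the NOFPRCVT lines
+; show the GPR round-trip used when FPRCVT is unavailable.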
+
+define float @fcvtzs_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+;
+; FPTOI saturating with rounding
+;
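+; A rounding intrinsic feeding a saturating conversion folds into the matching
+; rounded convert: llvm.round -> fcvta[su], llvm.floor -> fcvtm[su],
+; llvm.ceil -> fcvtp[su], llvm.trunc -> fcvtz[su].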
+
+define float @fcvtas_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtas_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtau_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtms_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtmu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtps_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtpu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index e18a5f6..d8f3708 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -980,12 +980,18 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
}
define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
-; CHECK-LABEL: test_bitcastv8i8tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.8b, v0.8b
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.8b, v0.8b
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.8b, v0.8b
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <8 x i8> zeroinitializer, %a
%1 = bitcast <8 x i8> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -993,12 +999,18 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
}
define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
-; CHECK-LABEL: test_bitcastv4i16tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.4h, v0.4h
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.4h, v0.4h
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.4h, v0.4h
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <4 x i16> zeroinitializer, %a
%1 = bitcast <4 x i16> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1006,12 +1018,18 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
}
define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_bitcastv2i32tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.2s, v0.2s
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.2s, v0.2s
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.2s, v0.2s
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <2 x i32> zeroinitializer, %a
%1 = bitcast <2 x i32> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1031,8 +1049,7 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: neg x8, x8
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fcvtzs x8, d0
-; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fcvtzs d0, d0
; CHECK-GI-NEXT: ret
%sub.i = sub <1 x i64> zeroinitializer, %a
%1 = bitcast <1 x i64> %sub.i to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
index 627d31f..1e0cfa0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -359,11 +359,16 @@ define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzs d0, d0"?
define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzs_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzs_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzs_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptosi <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
@@ -438,11 +443,16 @@ define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzu d0, d0"?
define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzu_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzu_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzu x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzu_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzu d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptoui <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000..cf52934
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
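+; For example, a dup of a zero-extended i8 load (registers illustrative):
+;   ldrb w8, [x0]           ldr b0, [x0]
+;   dup  v0.4s, w8    =>    dup v0.4s, v0.s[0]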
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 1
+ %ext = zext i32 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff10..670574f2 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c741129..b963acd 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzs w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzs s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -1162,18 +1161,24 @@ declare <7 x i32> @llvm.fptosi.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptosi.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_signed_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_signed_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzs w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzs w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzs w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzs s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index efe0a1b..5a66b68 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzu w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzu s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -993,18 +992,24 @@ declare <7 x i32> @llvm.fptoui.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptoui.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_unsigned_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_unsigned_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzu w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzu w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzu w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzu s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
new file mode 100644
index 0000000..85991fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -O3 -mtriple=aarch64 | FileCheck %s
+
+; From #164775: this generates a pre-index load feeding a post-index store,
+; where the combine was checking the wrong instruction's uses when forming the
+; post-increment. It is quite delicate to make the compiler produce this
+; combination at exactly the point needed to reproduce the issue.
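+; The shape in question, as in the checks below (registers illustrative):
+;   ldrh w8, [x1, #4]!  // pre-index load writes back to the base register
+;   strh w8, [x1]       // the store then uses the updated base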
+
+@g_260 = dso_local global i16 0
+@g_480 = dso_local global i16 0
+
+define i32 @func_1(ptr %l_3253) {
+; CHECK-LABEL: func_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w10, #96 // =0x60
+; CHECK-NEXT: strb wzr, [x9]
+; CHECK-NEXT: mov w9, #111 // =0x6f
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: str wzr, [x9]
+; CHECK-NEXT: mov w9, #80 // =0x50
+; CHECK-NEXT: adrp x1, .L_MergedGlobals
+; CHECK-NEXT: add x1, x1, :lo12:.L_MergedGlobals
+; CHECK-NEXT: strh wzr, [x8]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #48 // =0x30
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: mov w10, #64 // =0x40
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: adrp x8, .L_MergedGlobals
+; CHECK-NEXT: strb wzr, [x0, #8]
+; CHECK-NEXT: strb wzr, [x0, #12]
+; CHECK-NEXT: strb wzr, [x0, #16]
+; CHECK-NEXT: strb wzr, [x0, #20]
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ldrh wzr, [x8, :lo12:.L_MergedGlobals]
+; CHECK-NEXT: ldrh w8, [x1, #4]!
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: strh w8, [x1]
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: b use
+entry:
+ %l_32531.sroa.3 = alloca [3 x i8], align 4
+ %l_32531.sroa.4 = alloca [115 x i8], align 4
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.3)
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.4)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 3, i1 false)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 115, i1 false)
+ %0 = getelementptr inbounds i8, ptr %l_3253, i64 8
+ store i8 0, ptr %0, align 4
+ %1 = getelementptr inbounds i8, ptr %l_3253, i64 12
+ store i8 0, ptr %1, align 4
+ %2 = getelementptr inbounds i8, ptr %l_3253, i64 16
+ store i8 0, ptr %2, align 4
+ %3 = getelementptr inbounds i8, ptr %l_3253, i64 20
+ store i8 0, ptr %3, align 4
+ %4 = load volatile i16, ptr @g_260, align 4
+ %5 = load i16, ptr @g_480, align 4
+ %dec.i.i = add i16 %5, -1
+ store i16 %dec.i.i, ptr @g_480, align 4
+ %call1 = tail call i32 @use(i32 0, ptr @g_480)
+ ret i32 %call1
+}
+
+declare i32 @use(i32, ptr)
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 9193025..628506b 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -84,8 +84,7 @@ entry:
define double @load_u64_from_u32_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -98,8 +97,7 @@ entry:
define double @load_u64_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -112,8 +110,7 @@ entry:
define double @load_u64_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -126,8 +123,7 @@ entry:
define float @load_u32_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -140,8 +136,7 @@ entry:
define float @load_u32_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -154,8 +149,7 @@ entry:
define half @load_u16_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -171,8 +165,7 @@ entry:
define double @load_u64_from_u32_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -185,8 +178,7 @@ entry:
define double @load_u64_from_u16_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -199,8 +191,7 @@ entry:
define double @load_u64_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -226,7 +217,7 @@ entry:
define float @load_u32_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -239,7 +230,7 @@ entry:
define half @load_u16_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -255,8 +246,7 @@ entry:
define double @load_u64_from_u32_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -269,8 +259,7 @@ entry:
define double @load_u64_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -283,8 +272,7 @@ entry:
define double @load_u64_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -297,8 +285,7 @@ entry:
define float @load_u32_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -311,8 +298,7 @@ entry:
define float @load_u32_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -325,8 +311,7 @@ entry:
define half @load_u16_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -354,7 +339,7 @@ entry:
define double @load_u64_from_u16_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ldr h0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -367,7 +352,7 @@ entry:
define double @load_u64_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -393,7 +378,7 @@ entry:
define float @load_u32_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -406,7 +391,7 @@ entry:
define half @load_u16_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -435,8 +420,7 @@ entry:
define double @load_u64_from_u16_offn(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offn:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #8190 // =0x1ffe
-; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ldr h0, [x0, #8190]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8190
@@ -503,8 +487,8 @@ entry:
define double @load_u64_from_u32_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
-; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 16384
@@ -517,7 +501,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -530,7 +515,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -543,8 +529,8 @@ entry:
define float @load_u32_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -557,7 +543,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -570,7 +557,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
new file mode 100644
index 0000000..4ec63ec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
@@ -0,0 +1,640 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+; This test recreates a regalloc crash reported in
+; https://github.com/llvm/llvm-project/issues/164181.
+; When rematerializing an instruction, we need to make sure to constrain the
+; newly allocated register to both the rematted def's reg class and the use's
+; reg class.
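+;
+; A rough sketch of that constraint in terms of the generic codegen APIs
+; (names and placement are illustrative assumptions, not the exact in-tree
+; fix): intersect the def's and the use's register classes when creating
+; the new vreg, e.g.
+;   const TargetRegisterClass *DefRC = MRI.getRegClass(OrigDefReg);
+;   const TargetRegisterClass *UseRC =
+;       TII.getRegClass(UseMI.getDesc(), UseOpIdx, &TRI, MF);
+;   MRI.setRegClass(NewReg, TRI.getCommonSubClass(DefRC, UseRC));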
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@var_32 = external global i16
+@var_35 = external global i64
+@var_39 = external global i64
+@var_46 = external global i64
+@var_50 = external global i32
+
+define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var_5, i32 %var_6, i32 %var_7, i8 %var_10, i64 %var_11, i8 %var_14, i32 %var_15, i64 %var_16, ptr %arr_3, ptr %arr_4, ptr %arr_6, ptr %arr_7, ptr %arr_12, ptr %arr_13, ptr %arr_19, i64 %mul, i64 %conv35, i64 %idxprom138.us16, i8 %0, i8 %1, ptr %invariant.gep875.us) #0 {
+; CHECK-LABEL: f:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #240
+; CHECK-NEXT: str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; CHECK-NEXT: str w6, [sp, #20] // 4-byte Folded Spill
+; CHECK-NEXT: str w4, [sp, #72] // 4-byte Folded Spill
+; CHECK-NEXT: str w3, [sp, #112] // 4-byte Folded Spill
+; CHECK-NEXT: str w5, [sp, #36] // 4-byte Folded Spill
+; CHECK-NEXT: tbz w5, #0, .LBB0_43
+; CHECK-NEXT: // %bb.1: // %for.body41.lr.ph
+; CHECK-NEXT: ldr x4, [sp, #312]
+; CHECK-NEXT: ldr x14, [sp, #280]
+; CHECK-NEXT: tbz w0, #0, .LBB0_42
+; CHECK-NEXT: // %bb.2: // %for.body41.us.preheader
+; CHECK-NEXT: ldrb w8, [sp, #368]
+; CHECK-NEXT: ldrb w12, [sp, #256]
+; CHECK-NEXT: ldr w26, [sp, #264]
+; CHECK-NEXT: adrp x20, :got:var_50
+; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov w21, #36006 // =0x8ca6
+; CHECK-NEXT: ldr x11, [sp, #376]
+; CHECK-NEXT: ldrb w13, [sp, #360]
+; CHECK-NEXT: ldp x17, x16, [sp, #296]
+; CHECK-NEXT: mov w22, #1 // =0x1
+; CHECK-NEXT: add x27, x14, #120
+; CHECK-NEXT: ldr x18, [sp, #288]
+; CHECK-NEXT: ldr x7, [sp, #272]
+; CHECK-NEXT: ldr x5, [sp, #248]
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: mov w23, wzr
+; CHECK-NEXT: mov w30, wzr
+; CHECK-NEXT: ldrb w19, [sp, #240]
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: mov x24, xzr
+; CHECK-NEXT: str w8, [sp, #108] // 4-byte Folded Spill
+; CHECK-NEXT: mov x3, x26
+; CHECK-NEXT: ldp x9, x8, [sp, #344]
+; CHECK-NEXT: str w12, [sp, #92] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #1 // =0x1
+; CHECK-NEXT: bic w12, w12, w0
+; CHECK-NEXT: str w12, [sp, #76] // 4-byte Folded Spill
+; CHECK-NEXT: mov w12, #48 // =0x30
+; CHECK-NEXT: str x9, [sp, #136] // 8-byte Folded Spill
+; CHECK-NEXT: ldp x9, x15, [sp, #328]
+; CHECK-NEXT: madd x8, x8, x12, x9
+; CHECK-NEXT: str x8, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: add x8, x26, w26, uxtw #1
+; CHECK-NEXT: ldr x20, [x20, :got_lo12:var_50]
+; CHECK-NEXT: str x26, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x14, [sp, #152] // 8-byte Folded Spill
+; CHECK-NEXT: lsl x6, x8, #3
+; CHECK-NEXT: add x8, x14, #120
+; CHECK-NEXT: str x4, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: str w19, [sp, #16] // 4-byte Folded Spill
+; CHECK-NEXT: str x8, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB0_4
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_3: // in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: ldr w19, [sp, #16] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x14, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: mov w23, #1 // =0x1
+; CHECK-NEXT: mov w30, #1 // =0x1
+; CHECK-NEXT: mov w25, w19
+; CHECK-NEXT: .LBB0_4: // %for.body41.us
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB0_6 Depth 2
+; CHECK-NEXT: // Child Loop BB0_8 Depth 3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload
+; CHECK-NEXT: mov x12, x24
+; CHECK-NEXT: str x24, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: str w8, [x14]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: strb w19, [x14]
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_5: // %for.cond.cleanup93.us
+; CHECK-NEXT: // in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT: ldr w9, [sp, #36] // 4-byte Folded Reload
+; CHECK-NEXT: ldr x4, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x24, x12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: tbz w9, #0, .LBB0_3
+; CHECK-NEXT: .LBB0_6: // %for.body67.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB0_8 Depth 3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: str x12, [sp, #40] // 8-byte Folded Spill
+; CHECK-NEXT: cmn x24, #30
+; CHECK-NEXT: mov x12, #-30 // =0xffffffffffffffe2
+; CHECK-NEXT: add x19, x4, w8, sxtw #2
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: csel x12, x24, x12, lo
+; CHECK-NEXT: mov w4, w30
+; CHECK-NEXT: str x12, [sp, #56] // 8-byte Folded Spill
+; CHECK-NEXT: b .LBB0_8
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_7: // %for.cond.cleanup98.us
+; CHECK-NEXT: // in Loop: Header=BB0_8 Depth=3
+; CHECK-NEXT: ldr w4, [sp, #72] // 4-byte Folded Reload
+; CHECK-NEXT: ldr w23, [sp, #128] // 4-byte Folded Reload
+; CHECK-NEXT: mov w9, #1 // =0x1
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: tbnz w0, #0, .LBB0_5
+; CHECK-NEXT: .LBB0_8: // %for.cond95.preheader.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // => This Loop Header: Depth=3
+; CHECK-NEXT: // Child Loop BB0_10 Depth 4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr x8, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: mov w14, #1152 // =0x480
+; CHECK-NEXT: mov w24, #1 // =0x1
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: mov w30, w4
+; CHECK-NEXT: madd x8, x9, x14, x8
+; CHECK-NEXT: mov w14, #1 // =0x1
+; CHECK-NEXT: str x8, [sp, #120] // 8-byte Folded Spill
+; CHECK-NEXT: add x8, x9, x9, lsl #1
+; CHECK-NEXT: lsl x26, x8, #4
+; CHECK-NEXT: sxtb w8, w23
+; CHECK-NEXT: mov w23, w25
+; CHECK-NEXT: str w8, [sp, #116] // 4-byte Folded Spill
+; CHECK-NEXT: b .LBB0_10
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_9: // %for.cond510.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w23, [sp, #92] // 4-byte Folded Reload
+; CHECK-NEXT: mov x22, x8
+; CHECK-NEXT: ldr x3, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: mov x28, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov x14, xzr
+; CHECK-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload
+; CHECK-NEXT: tbz w8, #31, .LBB0_7
+; CHECK-NEXT: .LBB0_10: // %for.body99.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // => This Loop Header: Depth=4
+; CHECK-NEXT: // Child Loop BB0_11 Depth 5
+; CHECK-NEXT: // Child Loop BB0_28 Depth 5
+; CHECK-NEXT: // Child Loop BB0_39 Depth 5
+; CHECK-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload
+; CHECK-NEXT: and w8, w8, w8, asr #31
+; CHECK-NEXT: str w8, [sp, #128] // 4-byte Folded Spill
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_11: // %for.body113.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: tbnz w0, #0, .LBB0_11
+; CHECK-NEXT: // %bb.12: // %for.cond131.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload
+; CHECK-NEXT: mov w4, #1 // =0x1
+; CHECK-NEXT: strb w8, [x18]
+; CHECK-NEXT: ldr x8, [sp, #120] // 8-byte Folded Reload
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: cbnz w4, .LBB0_14
+; CHECK-NEXT: // %bb.13: // %cond.true146.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldrsb w4, [x27, x3]
+; CHECK-NEXT: b .LBB0_15
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_14: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: .LBB0_15: // %cond.end154.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w25, #18984 // =0x4a28
+; CHECK-NEXT: mul w8, w8, w25
+; CHECK-NEXT: and w8, w8, #0xfff8
+; CHECK-NEXT: lsl w8, w8, w4
+; CHECK-NEXT: cbz w8, .LBB0_17
+; CHECK-NEXT: // %bb.16: // %if.then.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: str wzr, [x18]
+; CHECK-NEXT: .LBB0_17: // %if.end.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload
+; CHECK-NEXT: mov w4, #18984 // =0x4a28
+; CHECK-NEXT: mov w25, w23
+; CHECK-NEXT: strb w8, [x18]
+; CHECK-NEXT: ldrsb w8, [x27, x3]
+; CHECK-NEXT: lsl w8, w4, w8
+; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d
+; CHECK-NEXT: movk x4, #58909, lsl #16
+; CHECK-NEXT: cbz w8, .LBB0_19
+; CHECK-NEXT: // %bb.18: // %if.then.us.2
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: strb wzr, [x18]
+; CHECK-NEXT: .LBB0_19: // %if.then.us.5
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr w23, [sp, #132] // 4-byte Folded Reload
+; CHECK-NEXT: mov w8, #29625 // =0x73b9
+; CHECK-NEXT: movk w8, #21515, lsl #16
+; CHECK-NEXT: cmp w23, w8
+; CHECK-NEXT: csel w23, w23, w8, lt
+; CHECK-NEXT: str w23, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: tbz w0, #0, .LBB0_21
+; CHECK-NEXT: // %bb.20: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: b .LBB0_22
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_21: // %cond.true146.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldrsb w8, [x27, x3]
+; CHECK-NEXT: .LBB0_22: // %cond.end154.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w23, #18984 // =0x4a28
+; CHECK-NEXT: mov w3, #149 // =0x95
+; CHECK-NEXT: lsl w8, w23, w8
+; CHECK-NEXT: cbz w8, .LBB0_24
+; CHECK-NEXT: // %bb.23: // %if.then.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: str wzr, [sp, #132] // 4-byte Folded Spill
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: .LBB0_24: // %if.end.us.7
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov x23, xzr
+; CHECK-NEXT: b .LBB0_28
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_25: // %cond.true331.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: ldrsb w4, [x10]
+; CHECK-NEXT: .LBB0_26: // %cond.end345.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: strh w4, [x18]
+; CHECK-NEXT: mul x4, x22, x28
+; CHECK-NEXT: adrp x22, :got:var_46
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: ldr x22, [x22, :got_lo12:var_46]
+; CHECK-NEXT: str x4, [x22]
+; CHECK-NEXT: mov x4, #-18403 // =0xffffffffffffb81d
+; CHECK-NEXT: movk x4, #58909, lsl #16
+; CHECK-NEXT: .LBB0_27: // %for.inc371.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w22, #-18978 // =0xffffb5de
+; CHECK-NEXT: orr x23, x23, #0x1
+; CHECK-NEXT: mov x24, xzr
+; CHECK-NEXT: mul w12, w12, w22
+; CHECK-NEXT: mov x22, x5
+; CHECK-NEXT: tbz w0, #0, .LBB0_36
+; CHECK-NEXT: .LBB0_28: // %for.body194.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: cbnz wzr, .LBB0_30
+; CHECK-NEXT: // %bb.29: // %if.then222.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: adrp x27, :got:var_32
+; CHECK-NEXT: ldur w8, [x19, #-12]
+; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_32]
+; CHECK-NEXT: strh w8, [x27]
+; CHECK-NEXT: sxtb w8, w25
+; CHECK-NEXT: bic w25, w8, w8, asr #31
+; CHECK-NEXT: b .LBB0_31
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_30: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w25, wzr
+; CHECK-NEXT: .LBB0_31: // %if.end239.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: strb w3, [x16]
+; CHECK-NEXT: tst w13, #0xff
+; CHECK-NEXT: b.eq .LBB0_33
+; CHECK-NEXT: // %bb.32: // %if.then254.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: ldrh w8, [x26, x14, lsl #1]
+; CHECK-NEXT: adrp x27, :got:var_35
+; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_35]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel x8, xzr, x7, eq
+; CHECK-NEXT: str x8, [x27]
+; CHECK-NEXT: strh w1, [x17]
+; CHECK-NEXT: .LBB0_33: // %if.end282.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: orr x27, x24, x4
+; CHECK-NEXT: adrp x8, :got:var_39
+; CHECK-NEXT: str x27, [x18]
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:var_39]
+; CHECK-NEXT: str x10, [x8]
+; CHECK-NEXT: ldrb w8, [x6, x9]
+; CHECK-NEXT: str x8, [x18]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: cbnz x2, .LBB0_27
+; CHECK-NEXT: // %bb.34: // %if.then327.us
+; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: cbz w8, .LBB0_25
+; CHECK-NEXT: // %bb.35: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: b .LBB0_26
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_36: // %for.cond376.preheader.us
+; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4
+; CHECK-NEXT: mov w3, #1152 // =0x480
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: mov x24, x27
+; CHECK-NEXT: lsl x23, x14, #1
+; CHECK-NEXT: mov x27, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: madd x14, x14, x3, x11
+; CHECK-NEXT: mov w28, w30
+; CHECK-NEXT: mov w3, #-7680 // =0xffffe200
+; CHECK-NEXT: b .LBB0_39
+; CHECK-NEXT: .p2align 5, , 16
+; CHECK-NEXT: .LBB0_37: // %if.then466.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: ldr x28, [sp, #152] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x3, [sp, #136] // 8-byte Folded Reload
+; CHECK-NEXT: sxtb w4, w4
+; CHECK-NEXT: bic w4, w4, w4, asr #31
+; CHECK-NEXT: str x3, [x28]
+; CHECK-NEXT: mov w3, #-7680 // =0xffffe200
+; CHECK-NEXT: .LBB0_38: // %for.inc505.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: add x22, x22, #1
+; CHECK-NEXT: add x27, x27, #1
+; CHECK-NEXT: mov w28, wzr
+; CHECK-NEXT: cmp x27, #0
+; CHECK-NEXT: b.hs .LBB0_9
+; CHECK-NEXT: .LBB0_39: // %for.body380.us
+; CHECK-NEXT: // Parent Loop BB0_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB0_6 Depth=2
+; CHECK-NEXT: // Parent Loop BB0_8 Depth=3
+; CHECK-NEXT: // Parent Loop BB0_10 Depth=4
+; CHECK-NEXT: // => This Inner Loop Header: Depth=5
+; CHECK-NEXT: mov w30, w28
+; CHECK-NEXT: ldrh w28, [x23]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: strh w28, [x11]
+; CHECK-NEXT: csel w28, w21, w3, ne
+; CHECK-NEXT: str w28, [x20]
+; CHECK-NEXT: cbz x15, .LBB0_38
+; CHECK-NEXT: // %bb.40: // %if.then436.us
+; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: ldrh w28, [x14]
+; CHECK-NEXT: cbnz w28, .LBB0_37
+; CHECK-NEXT: // %bb.41: // in Loop: Header=BB0_39 Depth=5
+; CHECK-NEXT: mov w4, wzr
+; CHECK-NEXT: b .LBB0_38
+; CHECK-NEXT: .LBB0_42: // %for.body41
+; CHECK-NEXT: strb wzr, [x4]
+; CHECK-NEXT: strb wzr, [x14]
+; CHECK-NEXT: .LBB0_43: // %for.cond563.preheader
+; CHECK-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #240
+; CHECK-NEXT: ret
+entry:
+ br i1 %var_5, label %for.body41.lr.ph, label %for.cond563.preheader
+
+for.body41.lr.ph: ; preds = %entry
+ %arrayidx147 = getelementptr i8, ptr %arr_3, i64 120
+ %tobool326.not = icmp eq i64 %var_2, 0
+ %not353 = xor i64 0, -1
+ %add538 = select i1 %var_0, i16 0, i16 1
+ br i1 %var_0, label %for.body41.us, label %for.body41
+
+for.body41.us: ; preds = %for.cond.cleanup93.us, %for.body41.lr.ph
+ %var_24.promoted9271009.us = phi i64 [ 0, %for.body41.lr.ph ], [ %6, %for.cond.cleanup93.us ]
+ %var_37.promoted9301008.us = phi i64 [ 1, %for.body41.lr.ph ], [ 0, %for.cond.cleanup93.us ]
+ %2 = phi i8 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ]
+ %add4139751001.us = phi i16 [ 0, %for.body41.lr.ph ], [ 1, %for.cond.cleanup93.us ]
+ %3 = phi i8 [ 0, %for.body41.lr.ph ], [ %var_10, %for.cond.cleanup93.us ]
+ store i32 %var_6, ptr %arr_3, align 4
+ store i8 %var_10, ptr %arr_3, align 1
+ br label %for.body67.us
+
+for.body67.us: ; preds = %for.cond.cleanup93.us, %for.body41.us
+ %4 = phi i8 [ %3, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %add413977.us = phi i16 [ %add4139751001.us, %for.body41.us ], [ %add413.us17, %for.cond.cleanup93.us ]
+ %5 = phi i8 [ %2, %for.body41.us ], [ %.sroa.speculated829.us, %for.cond.cleanup93.us ]
+ %conv64922.us = phi i32 [ 1, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %6 = phi i64 [ %var_24.promoted9271009.us, %for.body41.us ], [ %.sroa.speculated832.us, %for.cond.cleanup93.us ]
+ %mul354903918.us = phi i64 [ %var_37.promoted9301008.us, %for.body41.us ], [ 0, %for.cond.cleanup93.us ]
+ %i_2.0921.us = zext i32 %var_15 to i64
+ %.sroa.speculated832.us = tail call i64 @llvm.umin.i64(i64 %var_24.promoted9271009.us, i64 -30)
+ %sext1023 = shl i64 %i_2.0921.us, 1
+ %idxprom138.us162 = ashr i64 %sext1023, 1
+ %gep889.us = getelementptr [24 x i16], ptr %arr_19, i64 %idxprom138.us16
+ %arrayidx149.us = getelementptr i8, ptr %arrayidx147, i64 %idxprom138.us162
+ %arrayidx319.us = getelementptr [24 x i8], ptr null, i64 %idxprom138.us162
+ %7 = sext i32 %conv64922.us to i64
+ %8 = getelementptr i32, ptr %arr_12, i64 %7
+ %arrayidx226.us = getelementptr i8, ptr %8, i64 -12
+ br label %for.cond95.preheader.us
+
+for.cond.cleanup93.us: ; preds = %for.cond.cleanup98.us
+ br i1 %var_5, label %for.body67.us, label %for.body41.us
+
+for.cond.cleanup98.us: ; preds = %for.cond510.preheader.us
+ br i1 %var_0, label %for.cond.cleanup93.us, label %for.cond95.preheader.us
+
+for.body99.us: ; preds = %for.cond95.preheader.us, %for.cond510.preheader.us
+ %mul287985.us = phi i16 [ 0, %for.cond95.preheader.us ], [ %mul287.us, %for.cond510.preheader.us ]
+ %9 = phi i8 [ %29, %for.cond95.preheader.us ], [ %var_14, %for.cond510.preheader.us ]
+ %add413979.us = phi i16 [ %add413978.us, %for.cond95.preheader.us ], [ %add413.us17, %for.cond510.preheader.us ]
+ %10 = phi i32 [ 0, %for.cond95.preheader.us ], [ %26, %for.cond510.preheader.us ]
+ %mul354905.us = phi i64 [ %mul354904.us, %for.cond95.preheader.us ], [ %mul354907.us, %for.cond510.preheader.us ]
+ %sub283896.us = phi i64 [ 1, %for.cond95.preheader.us ], [ %sub283.us, %for.cond510.preheader.us ]
+ %conv96880.us = phi i64 [ 1, %for.cond95.preheader.us ], [ 0, %for.cond510.preheader.us ]
+ %.sroa.speculated829.us = tail call i8 @llvm.smin.i8(i8 %30, i8 0)
+ br label %for.body113.us
+
+for.body380.us: ; preds = %for.cond376.preheader.us, %for.inc505.us
+ %indvars.iv1018 = phi i64 [ 0, %for.cond376.preheader.us ], [ %indvars.iv.next1019, %for.inc505.us ]
+ %11 = phi i8 [ 0, %for.cond376.preheader.us ], [ %13, %for.inc505.us ]
+ %add413980.us = phi i16 [ %add413979.us, %for.cond376.preheader.us ], [ 0, %for.inc505.us ]
+ %12 = load i16, ptr %arrayidx384.us, align 2
+ store i16 %12, ptr %invariant.gep875.us, align 2
+ %add413.us17 = or i16 %add413980.us, 0
+ %arrayidx416.us = getelementptr i16, ptr %arr_13, i64 %indvars.iv1018
+ %conv419.us = select i1 %var_0, i32 36006, i32 -7680
+ store i32 %conv419.us, ptr @var_50, align 4
+ %tobool435.not.us = icmp eq i64 %mul, 0
+ br i1 %tobool435.not.us, label %for.inc505.us, label %if.then436.us
+
+if.then436.us: ; preds = %for.body380.us
+ %.sroa.speculated817.us = tail call i8 @llvm.smax.i8(i8 %11, i8 0)
+ %cond464.in.us = load i16, ptr %gep876.us, align 2
+ %tobool465.not.us = icmp eq i16 %cond464.in.us, 0
+ br i1 %tobool465.not.us, label %for.inc505.us, label %if.then466.us
+
+if.then466.us: ; preds = %if.then436.us
+ store i64 %conv35, ptr %arr_3, align 8
+ br label %for.inc505.us
+
+for.inc505.us: ; preds = %if.then466.us, %if.then436.us, %for.body380.us
+ %13 = phi i8 [ %11, %for.body380.us ], [ %.sroa.speculated817.us, %if.then466.us ], [ 0, %if.then436.us ]
+ %indvars.iv.next1019 = add i64 %indvars.iv1018, 1
+ %cmp378.us = icmp ult i64 %indvars.iv1018, 0
+ br i1 %cmp378.us, label %for.body380.us, label %for.cond510.preheader.us
+
+for.body194.us: ; preds = %if.end.us.7, %for.inc371.us
+ %indvars.iv = phi i64 [ 0, %if.end.us.7 ], [ %indvars.iv.next, %for.inc371.us ]
+ %mul287986.us = phi i16 [ %mul287985.us, %if.end.us.7 ], [ %mul287.us, %for.inc371.us ]
+ %14 = phi i8 [ %9, %if.end.us.7 ], [ %16, %for.inc371.us ]
+ %mul354906.us = phi i64 [ %mul354905.us, %if.end.us.7 ], [ %var_11, %for.inc371.us ]
+ %sub283897.us = phi i64 [ %sub283896.us, %if.end.us.7 ], [ 0, %for.inc371.us ]
+ %tobool221.not.us = icmp eq i32 1, 0
+ br i1 %tobool221.not.us, label %if.end239.us, label %if.then222.us
+
+if.then222.us: ; preds = %for.body194.us
+ %15 = load i32, ptr %arrayidx226.us, align 4
+ %conv227.us = trunc i32 %15 to i16
+ store i16 %conv227.us, ptr @var_32, align 2
+ %.sroa.speculated820.us = tail call i8 @llvm.smax.i8(i8 %14, i8 0)
+ br label %if.end239.us
+
+if.end239.us: ; preds = %if.then222.us, %for.body194.us
+ %16 = phi i8 [ %.sroa.speculated820.us, %if.then222.us ], [ 0, %for.body194.us ]
+ store i8 -107, ptr %arr_7, align 1
+ %tobool253.not.us = icmp eq i8 %0, 0
+ br i1 %tobool253.not.us, label %if.end282.us, label %if.then254.us
+
+if.then254.us: ; preds = %if.end239.us
+ %17 = load i16, ptr %arrayidx259.us, align 2
+ %tobool261.not.us = icmp eq i16 %17, 0
+ %conv268.us = select i1 %tobool261.not.us, i64 0, i64 %var_16
+ store i64 %conv268.us, ptr @var_35, align 8
+ %gep867.us = getelementptr [24 x [24 x i64]], ptr null, i64 %indvars.iv
+ store i16 %var_1, ptr %arr_6, align 2
+ br label %if.end282.us
+
+if.end282.us: ; preds = %if.then254.us, %if.end239.us
+ %sub283.us = or i64 %sub283897.us, -434259939
+ store i64 %sub283.us, ptr %arr_4, align 8
+ %mul287.us = mul i16 %mul287986.us, -18978
+ store i64 0, ptr @var_39, align 8
+ %18 = load i8, ptr %arrayidx321.us, align 1
+ %conv322.us = zext i8 %18 to i64
+ store i64 %conv322.us, ptr %arr_4, align 8
+ br i1 %tobool326.not, label %if.then327.us, label %for.inc371.us
+
+if.then327.us: ; preds = %if.end282.us
+ %tobool330.not.us = icmp eq i32 0, 0
+ br i1 %tobool330.not.us, label %cond.end345.us, label %cond.true331.us
+
+cond.true331.us: ; preds = %if.then327.us
+ %19 = load i8, ptr null, align 1
+ %20 = sext i8 %19 to i16
+ br label %cond.end345.us
+
+cond.end345.us: ; preds = %cond.true331.us, %if.then327.us
+ %cond346.us = phi i16 [ %20, %cond.true331.us ], [ 0, %if.then327.us ]
+ store i16 %cond346.us, ptr %arr_4, align 2
+ %mul354.us = mul i64 %mul354906.us, %not353
+ store i64 %mul354.us, ptr @var_46, align 8
+ br label %for.inc371.us
+
+for.inc371.us: ; preds = %cond.end345.us, %if.end282.us
+ %mul354907.us = phi i64 [ 1, %if.end282.us ], [ 0, %cond.end345.us ]
+ %indvars.iv.next = or i64 %indvars.iv, 1
+ br i1 %var_0, label %for.body194.us, label %for.cond376.preheader.us
+
+cond.true146.us: ; preds = %for.cond131.preheader.us
+ %21 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us = sext i8 %21 to i32
+ br label %cond.end154.us
+
+cond.end154.us: ; preds = %for.cond131.preheader.us, %cond.true146.us
+ %cond155.us = phi i32 [ %conv150.us, %cond.true146.us ], [ 0, %for.cond131.preheader.us ]
+ %shl.us = shl i32 %div.us, %cond155.us
+ %tobool157.not.us = icmp eq i32 %shl.us, 0
+ br i1 %tobool157.not.us, label %if.end.us, label %if.then.us
+
+if.then.us: ; preds = %cond.end154.us
+ store i32 0, ptr %arr_4, align 4
+ br label %if.end.us
+
+if.end.us: ; preds = %if.then.us, %cond.end154.us
+ %22 = phi i32 [ 0, %if.then.us ], [ %10, %cond.end154.us ]
+ store i8 %1, ptr %arr_4, align 1
+ call void @llvm.assume(i1 true)
+ %23 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us.2 = sext i8 %23 to i32
+ %shl.us.2 = shl i32 18984, %conv150.us.2
+ %tobool157.not.us.2 = icmp eq i32 %shl.us.2, 0
+ br i1 %tobool157.not.us.2, label %if.then.us.5, label %if.then.us.2
+
+if.then.us.2: ; preds = %if.end.us
+ %.sroa.speculated826.us.2 = tail call i32 @llvm.smin.i32(i32 %10, i32 0)
+ store i8 0, ptr %arr_4, align 1
+ br label %if.then.us.5
+
+if.then.us.5: ; preds = %if.then.us.2, %if.end.us
+ %24 = phi i32 [ 0, %if.then.us.2 ], [ %22, %if.end.us ]
+ %.sroa.speculated826.us.5 = tail call i32 @llvm.smin.i32(i32 %24, i32 1410036665)
+ br i1 %var_0, label %cond.end154.us.7, label %cond.true146.us.7
+
+cond.true146.us.7: ; preds = %if.then.us.5
+ %25 = load i8, ptr %arrayidx149.us, align 1
+ %conv150.us.7 = sext i8 %25 to i32
+ br label %cond.end154.us.7
+
+cond.end154.us.7: ; preds = %cond.true146.us.7, %if.then.us.5
+ %cond155.us.7 = phi i32 [ %conv150.us.7, %cond.true146.us.7 ], [ 0, %if.then.us.5 ]
+ %shl.us.7 = shl i32 18984, %cond155.us.7
+ %tobool157.not.us.7 = icmp eq i32 %shl.us.7, 0
+ br i1 %tobool157.not.us.7, label %if.end.us.7, label %if.then.us.7
+
+if.then.us.7: ; preds = %cond.end154.us.7
+ store i32 0, ptr %arr_3, align 4
+ br label %if.end.us.7
+
+if.end.us.7: ; preds = %if.then.us.7, %cond.end154.us.7
+ %26 = phi i32 [ 0, %if.then.us.7 ], [ %.sroa.speculated826.us.5, %cond.end154.us.7 ]
+ %arrayidx259.us = getelementptr i16, ptr %arrayidx257.us, i64 %conv96880.us
+ br label %for.body194.us
+
+for.body113.us: ; preds = %for.body113.us, %for.body99.us
+ br i1 %var_0, label %for.body113.us, label %for.cond131.preheader.us
+
+for.cond510.preheader.us: ; preds = %for.inc505.us
+ %cmp97.us = icmp slt i16 %add538, 0
+ br i1 %cmp97.us, label %for.body99.us, label %for.cond.cleanup98.us
+
+for.cond376.preheader.us: ; preds = %for.inc371.us
+ %arrayidx384.us = getelementptr i16, ptr null, i64 %conv96880.us
+ %gep876.us = getelementptr [24 x [24 x i16]], ptr %invariant.gep875.us, i64 %conv96880.us
+ br label %for.body380.us
+
+for.cond131.preheader.us: ; preds = %for.body113.us
+ store i8 %var_3, ptr %arr_4, align 1
+ %27 = load i16, ptr %gep884.us, align 2
+ %28 = mul i16 18984, %27
+ %div.us = zext i16 %28 to i32
+ %tobool145.not.us = icmp eq i8 0, 0
+ br i1 %tobool145.not.us, label %cond.end154.us, label %cond.true146.us
+
+for.cond95.preheader.us: ; preds = %for.cond.cleanup98.us, %for.body67.us
+ %indvars.iv1021 = phi i64 [ 1, %for.cond.cleanup98.us ], [ 0, %for.body67.us ]
+ %29 = phi i8 [ %16, %for.cond.cleanup98.us ], [ %4, %for.body67.us ]
+ %add413978.us = phi i16 [ %var_4, %for.cond.cleanup98.us ], [ %add413977.us, %for.body67.us ]
+ %30 = phi i8 [ %.sroa.speculated829.us, %for.cond.cleanup98.us ], [ %5, %for.body67.us ]
+ %mul354904.us = phi i64 [ 0, %for.cond.cleanup98.us ], [ %mul354903918.us, %for.body67.us ]
+ %gep884.us = getelementptr [24 x [24 x i16]], ptr %gep889.us, i64 %indvars.iv1021
+ %arrayidx321.us = getelementptr i8, ptr %arrayidx319.us, i64 %indvars.iv1021
+ %arrayidx257.us = getelementptr [24 x i16], ptr null, i64 %indvars.iv1021
+ br label %for.body99.us
+
+for.cond563.preheader: ; preds = %for.body41, %entry
+ ret void
+
+for.body41: ; preds = %for.body41.lr.ph
+ store i8 0, ptr %arr_12, align 1
+ store i8 0, ptr %arr_3, align 1
+ br label %for.cond563.preheader
+}
+
+attributes #0 = { nounwind "frame-pointer"="non-leaf" "target-cpu"="grace" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
index 06c53d8..b17b48c 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
@@ -86,4 +86,111 @@ define i64 @sme_cntsd_mul() {
ret i64 %res
}
-declare i64 @llvm.aarch64.sme.cntsd()
+define i64 @sme_cntsb_mul_pos() {
+; CHECK-LABEL: sme_cntsb_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, 96
+ ret i64 %res
+}
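+
+; Worked arithmetic for the fold above (my reading of the CHECK lines, not
+; taken from the commit): cntsd returns the streaming vector length in
+; doublewords, i.e. SVL_bytes / 8, so (cntsd << 3) * 96 = 96 * SVL_bytes =
+; (24 * SVL_bytes) << 2, which matches rdsvl x8, #24 followed by
+; lsl x0, x8, #2.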
+
+define i64 @sme_cntsh_mul_pos() {
+; CHECK-LABEL: sme_cntsh_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, 3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_pos() {
+; CHECK-LABEL: sme_cntsw_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, 62
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_pos() {
+; CHECK-LABEL: sme_cntsd_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 992
+ ret i64 %res
+}
+
+define i64 @sme_cntsb_mul_neg() {
+; CHECK-LABEL: sme_cntsb_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, -96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_neg() {
+; CHECK-LABEL: sme_cntsh_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, -3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_neg() {
+; CHECK-LABEL: sme_cntsw_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-31
+; CHECK-NEXT: lsl x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, -992
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_neg() {
+; CHECK-LABEL: sme_cntsd_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, -3
+ ret i64 %res
+}
+
+; Negative test: 993 cannot be decomposed into an in-range rdsvl immediate
+; and a power-of-two shift, so the multiply is not folded.
+define i64 @sme_cntsd_mul_fail() {
+; CHECK-LABEL: sme_cntsd_mul_fail:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov w9, #993 // =0x3e1
+; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: mul x0, x8, x9
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 993
+ ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b8d6c88..3f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-SDAG-NEXT: bl __arm_sme_restore
; CHECK-SDAG-NEXT: b .LBB5_1
entry:
- invoke void @agnostic_za_call()
+ invoke void @agnostic_za_call() "aarch64_za_state_agnostic"
to label %exit unwind label %catch
catch:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 353c09b..ecd7cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = ashr i65 %value, 33
ret i65 %result
@@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
new file mode 100644
index 0000000..48e9818
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+
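+# The tests below exercise narrowing of a wide G_OR whose RHS is a zero
+# extension: (G_OR x:s64, (G_ZEXT y:s32)) can only affect the low 32 bits,
+# so the combiner is expected to unmerge x, OR the low half with y, and
+# re-merge with the untouched high half (as the CHECK lines show).
+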
+---
+name: test_combine_or_s64_s32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %0, %2
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = COPY $sgpr2
+ %2:_(s64) = G_ZEXT %1(s32)
+ %3:_(s64) = G_OR %2, %0
+ $sgpr0_sgpr1 = COPY %3(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_combine_or_s64_s32_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = COPY $sgpr2
+ %3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %4:_(s64) = G_ZEXT %2(s32)
+ %5:_(s64) = G_OR %3, %4
+ %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64)
+ $sgpr0 = COPY %6(s32)
+ $sgpr1 = COPY %7(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-LABEL: name: negative_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_ZEXT %1
+ %3:_(s128) = G_OR %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 5dff8c1..667fa98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -227,39 +227,38 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_getpc_b64 s[12:13]
-; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_mov_b32 s2, s0
-; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
-; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s12, s12, 4
+; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_add_i32 s2, s3, s2
+; GFX10-NEXT: v_readfirstlane_b32 s12, v3
+; GFX10-NEXT: s_add_i32 s3, s12, s3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index bd53032..715a777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -4934,17 +4934,15 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GCN-NEXT: s_lshr_b32 s2, s3, 27
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX11-NEXT: s_lshr_b32 s2, s3, 27
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
ret i64 %result
@@ -4954,20 +4952,13 @@ define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshl_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -6823,56 +6814,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX6-NEXT: s_lshr_b32 s4, s5, 31
-; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_lshr_b32 s4, s7, 31
-; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128_65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX8-NEXT: s_lshr_b32 s4, s5, 31
-; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s7, 31
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128_65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
; GFX9-NEXT: s_lshr_b32 s4, s5, 31
-; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s7, 31
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_or_b32 s2, s2, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128_65:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s5, 31
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX10-NEXT: s_lshr_b32 s2, s7, 31
-; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX10-NEXT: s_lshr_b32 s4, s5, 31
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX10-NEXT: s_lshr_b32 s5, s7, 31
+; GFX10-NEXT: s_or_b32 s0, s0, s4
+; GFX10-NEXT: s_or_b32 s2, s2, s5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i128_65:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s5, 31
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX11-NEXT: s_lshr_b32 s2, s7, 31
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[0:1], 1
+; GFX11-NEXT: s_lshr_b32 s4, s5, 31
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
+; GFX11-NEXT: s_lshr_b32 s5, s7, 31
+; GFX11-NEXT: s_or_b32 s0, s0, s4
+; GFX11-NEXT: s_or_b32 s2, s2, s5
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@@ -6885,7 +6870,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6896,7 +6881,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6907,7 +6892,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7
; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6919,7 +6904,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -6931,7 +6916,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v2, v2, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ea6b3a3..5aa5a671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -4715,20 +4715,13 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
; GCN-LABEL: s_fshr_i64_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s2, s3
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s2, s3
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
ret i64 %result
@@ -4739,17 +4732,15 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GCN-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64_48:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
ret i64 %result
@@ -5293,34 +5284,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX6-NEXT: s_not_b32 s9, s8
-; GFX6-NEXT: s_sub_i32 s16, s2, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s16, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX6-NEXT: s_sub_i32 s14, s0, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s0
-; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX6-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s9, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s9
+; GFX6-NEXT: s_cmp_lt_u32 s9, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: s_cmp_eq_u32 s9, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5330,9 +5320,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
@@ -5340,34 +5330,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX8-NEXT: s_not_b32 s9, s8
-; GFX8-NEXT: s_sub_i32 s16, s2, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s16, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX8-NEXT: s_sub_i32 s14, s0, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s0
-; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX8-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s9, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s9
+; GFX8-NEXT: s_cmp_lt_u32 s9, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: s_cmp_eq_u32 s9, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5377,9 +5366,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
@@ -5387,34 +5376,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s8
; GFX9-NEXT: s_not_b32 s9, s8
-; GFX9-NEXT: s_sub_i32 s16, s2, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s16, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s9
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], s9
; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
-; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
-; GFX9-NEXT: s_sub_i32 s14, s0, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s0
-; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX9-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s9, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s9
+; GFX9-NEXT: s_cmp_lt_u32 s9, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: s_cmp_eq_u32 s9, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s8
; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
@@ -5424,19 +5412,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s10, s1, 31
-; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
+; GFX10-NEXT: s_lshr_b32 s9, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_or_b32 s2, s2, s9
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_not_b32 s14, s8
; GFX10-NEXT: s_sub_i32 s16, s9, 64
; GFX10-NEXT: s_sub_i32 s10, 64, s9
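
The recurring change across these hunks: the old lowering zero-extended the single bit carried out of the low half (`s_lshr_b32 ..., 31`) into a 64-bit register pair with `s_mov_b32 ..., 0` so it could combine it via `s_or_b64`; the new lowering ORs that bit directly into the low 32-bit register with `s_or_b32`, dropping one instruction and freeing an SGPR per shift. For reference, a minimal IR sketch of what the `s_fshr_i128` checks cover (the intrinsic is real; the wrapper name is illustrative):

    declare i128 @llvm.fshr.i128(i128, i128, i128)

    define i128 @fshr_i128_sketch(i128 %lhs, i128 %rhs, i128 %amt) {
      ; Conceptually: shift the 256-bit concatenation lhs:rhs right by
      ; amt % 128 and return the low 128 bits.
      %r = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
      ret i128 %r
    }
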
@@ -5479,11 +5466,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s10, s1, 31
-; GFX11-NEXT: s_mov_b32 s11, 0
-; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
+; GFX11-NEXT: s_lshr_b32 s9, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_or_b32 s2, s2, s9
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_not_b32 s14, s8
; GFX11-NEXT: s_sub_i32 s16, s9, 64
; GFX11-NEXT: s_sub_i32 s10, 64, s9
@@ -5786,13 +5772,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_or_b32 s2, s2, s0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8
; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5803,8 +5788,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s0
-; GFX6-NEXT: v_mov_b32_e32 v4, s1
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s3
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5839,13 +5824,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_or_b32 s2, s2, s0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5856,8 +5840,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5892,12 +5876,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_or_b32 s2, s2, s0
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
@@ -5908,10 +5891,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
@@ -5941,34 +5924,33 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX10-NEXT: s_lshr_b32 s2, s1, 31
; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_or_b32 s8, s8, s2
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
@@ -5988,18 +5970,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 1
+; GFX11-NEXT: s_lshr_b32 s2, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_or_b32 s8, s8, s2
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
@@ -6045,26 +6027,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
-; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX6-NEXT: s_not_b32 s5, s4
-; GFX6-NEXT: s_sub_i32 s12, s2, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
+; GFX6-NEXT: s_sub_i32 s12, s0, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s14, 1, 0
; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
; GFX6-NEXT: s_sub_i32 s1, s0, 64
; GFX6-NEXT: s_sub_i32 s4, 64, s0
@@ -6073,14 +6054,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT: s_and_b32 s0, 1, s8
+; GFX6-NEXT: s_and_b32 s0, 1, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6088,10 +6069,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX6-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
@@ -6099,26 +6080,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
-; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX8-NEXT: s_not_b32 s5, s4
-; GFX8-NEXT: s_sub_i32 s12, s2, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
+; GFX8-NEXT: s_sub_i32 s12, s0, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s14, 1, 0
; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s14, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
; GFX8-NEXT: s_sub_i32 s1, s0, 64
; GFX8-NEXT: s_sub_i32 s4, 64, s0
@@ -6127,14 +6107,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_and_b32 s0, 1, s8
+; GFX8-NEXT: s_and_b32 s0, 1, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6142,10 +6122,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
@@ -6153,26 +6133,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX9-NEXT: s_not_b32 s5, s4
-; GFX9-NEXT: s_sub_i32 s12, s2, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
+; GFX9-NEXT: s_sub_i32 s12, s0, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s14, 1, 0
; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s5
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s14, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
; GFX9-NEXT: s_sub_i32 s1, s0, 64
; GFX9-NEXT: s_sub_i32 s4, 64, s0
@@ -6181,14 +6160,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_and_b32 s0, 1, s8
+; GFX9-NEXT: s_and_b32 s0, 1, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6196,20 +6175,19 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_lshr_b32 s5, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_not_b32 s10, s4
; GFX10-NEXT: s_sub_i32 s12, s5, 64
; GFX10-NEXT: s_sub_i32 s6, 64, s5
@@ -6259,11 +6237,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s6, s1, 31
-; GFX11-NEXT: s_mov_b32 s7, 0
-; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
+; GFX11-NEXT: s_lshr_b32 s5, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_not_b32 s10, s4
; GFX11-NEXT: s_sub_i32 s12, s5, 64
; GFX11-NEXT: s_sub_i32 s6, 64, s5
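
The hunks that follow cover `s_fshr_v2i128`, the two-element vector form, which lowers each i128 lane with the same sequence shown above. A sketch of what its source likely looks like (the vector intrinsic is real; the body is a plausible reconstruction of the autogenerated test, not a quote from it):

    declare <2 x i128> @llvm.fshr.v2i128(<2 x i128>, <2 x i128>, <2 x i128>)

    define amdgpu_ps <2 x i128> @fshr_v2i128_sketch(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
      %r = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
      ret <2 x i128> %r
    }
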
@@ -6714,81 +6691,80 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s22, s1, 31
-; GFX6-NEXT: s_mov_b32 s23, 0
; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_or_b32 s2, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX6-NEXT: s_not_b32 s17, s16
-; GFX6-NEXT: s_sub_i32 s21, s2, 64
-; GFX6-NEXT: s_sub_i32 s22, 64, s2
-; GFX6-NEXT: s_cmp_lt_u32 s2, 64
-; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s29, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX6-NEXT: s_cmp_lg_u32 s29, 0
-; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX6-NEXT: s_and_b32 s0, s16, 0x7f
; GFX6-NEXT: s_sub_i32 s21, s0, 64
; GFX6-NEXT: s_sub_i32 s22, 64, s0
; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX6-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX6-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s21, s17, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s17
+; GFX6-NEXT: s_cmp_lt_u32 s17, 64
+; GFX6-NEXT: s_cselect_b32 s24, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s25, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s27, 0
+; GFX6-NEXT: s_cmp_lg_u32 s25, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_cmp_lg_u32 s26, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT: s_lshr_b32 s22, s5, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT: s_cmp_lg_u32 s24, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX6-NEXT: s_lshr_b32 s4, s5, 31
+; GFX6-NEXT: s_or_b32 s6, s6, s4
+; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_not_b32 s16, s20
-; GFX6-NEXT: s_sub_i32 s18, s6, 64
-; GFX6-NEXT: s_sub_i32 s10, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_sub_i32 s18, s4, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX6-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX6-NEXT: s_sub_i32 s18, s4, 64
-; GFX6-NEXT: s_sub_i32 s16, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX6-NEXT: s_sub_i32 s18, s8, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
@@ -6796,88 +6772,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s22, s1, 31
-; GFX8-NEXT: s_mov_b32 s23, 0
; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_or_b32 s2, s2, s0
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX8-NEXT: s_not_b32 s17, s16
-; GFX8-NEXT: s_sub_i32 s21, s2, 64
-; GFX8-NEXT: s_sub_i32 s22, 64, s2
-; GFX8-NEXT: s_cmp_lt_u32 s2, 64
-; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s2, 0
-; GFX8-NEXT: s_cselect_b32 s29, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX8-NEXT: s_cmp_lg_u32 s29, 0
-; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX8-NEXT: s_and_b32 s0, s16, 0x7f
; GFX8-NEXT: s_sub_i32 s21, s0, 64
; GFX8-NEXT: s_sub_i32 s22, 64, s0
; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX8-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX8-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s21, s17, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s17
+; GFX8-NEXT: s_cmp_lt_u32 s17, 64
+; GFX8-NEXT: s_cselect_b32 s24, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s17, 0
+; GFX8-NEXT: s_cselect_b32 s25, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s27, 0
+; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT: s_lshr_b32 s22, s5, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT: s_cmp_lg_u32 s24, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX8-NEXT: s_lshr_b32 s4, s5, 31
+; GFX8-NEXT: s_or_b32 s6, s6, s4
+; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_not_b32 s16, s20
-; GFX8-NEXT: s_sub_i32 s18, s6, 64
-; GFX8-NEXT: s_sub_i32 s10, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_sub_i32 s18, s4, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX8-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX8-NEXT: s_sub_i32 s18, s4, 64
-; GFX8-NEXT: s_sub_i32 s16, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX8-NEXT: s_sub_i32 s18, s8, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
@@ -6885,88 +6860,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s22, s1, 31
-; GFX9-NEXT: s_mov_b32 s23, 0
; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_or_b32 s2, s2, s0
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s16
; GFX9-NEXT: s_not_b32 s17, s16
-; GFX9-NEXT: s_sub_i32 s21, s2, 64
-; GFX9-NEXT: s_sub_i32 s22, 64, s2
-; GFX9-NEXT: s_cmp_lt_u32 s2, 64
-; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s29, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
-; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
-; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX9-NEXT: s_cmp_lg_u32 s29, 0
-; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
-; GFX9-NEXT: s_and_b32 s0, s16, 0x7f
; GFX9-NEXT: s_sub_i32 s21, s0, 64
; GFX9-NEXT: s_sub_i32 s22, 64, s0
; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX9-NEXT: s_lshr_b64 s[22:23], s[18:19], s22
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[2:3], s17
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[18:19], s17
+; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
+; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19]
+; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX9-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s21, s17, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s17
+; GFX9-NEXT: s_cmp_lt_u32 s17, 64
+; GFX9-NEXT: s_cselect_b32 s24, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s25, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[18:19], s[10:11], s16
; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
-; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23]
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s27, 0
+; GFX9-NEXT: s_cmp_lg_u32 s25, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_cmp_lg_u32 s26, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT: s_lshr_b32 s22, s5, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT: s_cmp_lg_u32 s24, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
-; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20
-; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
+; GFX9-NEXT: s_lshr_b32 s4, s5, 31
+; GFX9-NEXT: s_or_b32 s6, s6, s4
+; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_not_b32 s16, s20
-; GFX9-NEXT: s_sub_i32 s18, s6, 64
-; GFX9-NEXT: s_sub_i32 s10, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_sub_i32 s18, s4, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[8:9], s16
; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s16
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-NEXT: s_and_b32 s4, s20, 0x7f
-; GFX9-NEXT: s_sub_i32 s18, s4, 64
-; GFX9-NEXT: s_sub_i32 s16, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX9-NEXT: s_sub_i32 s18, s8, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], s20
; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
@@ -6974,61 +6948,60 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
-; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s18, s1, 31
-; GFX10-NEXT: s_mov_b32 s19, 0
-; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_lshr_b32 s17, s1, 31
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX10-NEXT: s_not_b32 s18, s16
-; GFX10-NEXT: s_sub_i32 s21, s17, 64
-; GFX10-NEXT: s_sub_i32 s22, 64, s17
+; GFX10-NEXT: s_or_b32 s2, s2, s17
+; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
+; GFX10-NEXT: s_not_b32 s21, s16
+; GFX10-NEXT: s_sub_i32 s26, s17, 64
+; GFX10-NEXT: s_sub_i32 s18, 64, s17
; GFX10-NEXT: s_cmp_lt_u32 s17, 64
-; GFX10-NEXT: s_cselect_b32 s28, 1, 0
+; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT: s_cmp_lg_u32 s28, 0
-; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX10-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX10-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX10-NEXT: s_cmp_lg_u32 s27, 0
+; GFX10-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_and_b32 s0, s16, 0x7f
-; GFX10-NEXT: s_sub_i32 s18, s0, 64
+; GFX10-NEXT: s_sub_i32 s21, s0, 64
; GFX10-NEXT: s_sub_i32 s17, 64, s0
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
-; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s26, 1, 0
+; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT: s_cmp_lg_u32 s26, 0
+; GFX10-NEXT: s_cmp_lg_u32 s25, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_lshr_b32 s18, s5, 31
+; GFX10-NEXT: s_lshr_b32 s8, s5, 31
+; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20
-; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX10-NEXT: s_not_b32 s16, s20
; GFX10-NEXT: s_sub_i32 s18, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
@@ -7071,54 +7044,53 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX11-LABEL: s_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s18, s1, 31
-; GFX11-NEXT: s_mov_b32 s19, 0
-; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_lshr_b32 s17, s1, 31
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: s_not_b32 s18, s16
-; GFX11-NEXT: s_sub_i32 s21, s17, 64
-; GFX11-NEXT: s_sub_i32 s22, 64, s17
+; GFX11-NEXT: s_or_b32 s2, s2, s17
+; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
+; GFX11-NEXT: s_not_b32 s21, s16
+; GFX11-NEXT: s_sub_i32 s26, s17, 64
+; GFX11-NEXT: s_sub_i32 s18, 64, s17
; GFX11-NEXT: s_cmp_lt_u32 s17, 64
-; GFX11-NEXT: s_cselect_b32 s28, 1, 0
+; GFX11-NEXT: s_cselect_b32 s27, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s17, 0
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
-; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_lshr_b64 s[18:19], s[0:1], s18
+; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], s21
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[0:1], s21
+; GFX11-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s26
+; GFX11-NEXT: s_cmp_lg_u32 s27, 0
+; GFX11-NEXT: s_cselect_b64 s[22:23], s[24:25], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_and_b32 s0, s16, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s18, s0, 64
+; GFX11-NEXT: s_sub_i32 s21, s0, 64
; GFX11-NEXT: s_sub_i32 s17, 64, s0
; GFX11-NEXT: s_cmp_lt_u32 s0, 64
-; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_cselect_b32 s24, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s26, 1, 0
+; GFX11-NEXT: s_cselect_b32 s25, 1, 0
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
-; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_lshl_b64 s[18:19], s[10:11], s17
; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_cmp_lg_u32 s25, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cmp_lg_u32 s24, 0
; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_lshr_b32 s18, s5, 31
+; GFX11-NEXT: s_lshr_b32 s8, s5, 31
+; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_or_b32 s6, s6, s8
; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20
-; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
; GFX11-NEXT: s_not_b32 s16, s20
; GFX11-NEXT: s_sub_i32 s18, s8, 64
; GFX11-NEXT: s_sub_i32 s9, 64, s8
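
The new MIR tests below check legalization of `G_FMAXIMUM` (IEEE-754-2019 maximum: NaN-propagating, with -0 ordered below +0). On gfx900 the operation is not legal, so the checks show it expanded into `G_FMAXNUM_IEEE` plus an ordered compare that selects a quiet NaN when either input is NaN; on gfx1200 it stays a single `G_FMAXIMUM`. Roughly the same expansion at the IR level, as a sketch under the assumption that the target's IEEE max instruction already orders -0 below +0 (the intrinsic is real; the function name is illustrative):

    declare float @llvm.maxnum.f32(float, float)

    define float @fmaximum_expansion_sketch(float %a, float %b) {
      %m   = call float @llvm.maxnum.f32(float %a, float %b)    ; IEEE maxNum
      %ord = fcmp ord float %a, %b                              ; false if either input is NaN
      %r   = select i1 %ord, float %m, float 0x7FF8000000000000 ; propagate a quiet NaN
      ret float %r
    }
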
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000..4b214e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fmaximum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMAXIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMAXIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nnan G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
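
The companion file below is the mirrored minimum case: the same test structure, with `G_FMINNUM_IEEE` in the gfx900 expansion and `G_FMINIMUM` legal on gfx1200. The IR-level analogue, with the same caveats as the maximum sketch above:

    declare float @llvm.minnum.f32(float, float)

    define float @fminimum_expansion_sketch(float %a, float %b) {
      %m   = call float @llvm.minnum.f32(float %a, float %b)    ; IEEE minNum
      %ord = fcmp ord float %a, %b                              ; false if either input is NaN
      %r   = select i1 %ord, float %m, float 0x7FF8000000000000 ; propagate a quiet NaN
      ret float %r
    }
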
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
new file mode 100644
index 0000000..8ba0794
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fminimum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s16) = G_FMINIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMINIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMINIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMINIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMINIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fminimum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fminimum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMINNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fminimum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMINIMUM1:%[0-9]+]]:_(s32) = G_FMINIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMINIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMINIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fminimum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMINNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nsz G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fminimum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fminimum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMINNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fminimum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMINIMUM:%[0-9]+]]:_(s32) = nnan G_FMINIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMINIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nnan G_FMINIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+
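
The nnan variants show the payoff of the flag: once NaNs are excluded, the ordered-compare/select fixup is dead and only the G_FMINNUM_IEEE survives on GFX9. A hedged IR-level sketch (function name illustrative; llvm.minimum carries the same IEEE-754-2019 semantics as G_FMINIMUM):

define float @fminimum_nnan_sketch(float %a, float %b) {
  ; with nnan, the legalizer's NaN fixup folds away and a plain
  ; IEEE minimum is enough
  %r = call nnan float @llvm.minimum.f32(float %a, float %b)
  ret float %r
}

declare float @llvm.minimum.f32(float, float)
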
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
index c8bd8ab..423ce82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
@@ -18,58 +18,12 @@ body: |
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
- %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 1)
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1
$vgpr0_vgpr1 = COPY %2
...
---
-name: shufflevector_scalar_src_dst
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1
-
- ; CHECK-LABEL: name: shufflevector_scalar_src_dst
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1)
- $vgpr0 = COPY %2
-
-...
-
----
-name: shufflevector_scalar_dst
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-
- ; CHECK-LABEL: name: shufflevector_scalar_dst
- ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](s32)
- %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
- %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
- %2:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2)
- $vgpr0 = COPY %2
-
-...
-
----
name: shufflevector_v2s32_0_1
tracksRegLiveness: true
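
The two deleted tests covered shuffles with scalar sources or a scalar result, presumably because those forms of G_SHUFFLE_VECTOR are no longer accepted; the surviving case is now spelled as the operation it always was, since a mask of (0, 1) over two scalars is plain vector construction. The IR equivalent of the new G_BUILD_VECTOR line (names illustrative):

define <2 x i32> @build_vector_sketch(i32 %a, i32 %b) {
  ; equivalent of G_BUILD_VECTOR %a, %b
  %t = insertelement <2 x i32> poison, i32 %a, i64 0
  %v = insertelement <2 x i32> %t, i32 %b, i64 1
  ret <2 x i32> %v
}
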
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
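
Both deletions in this file (and the matching wave64 file below) drop an s_cmp_lg_u32 that merely re-derived a fact already in SCC: s_and_b32 sets SCC to "result is non-zero", which is exactly the condition s_cbranch_scc0 consumes. A hedged sketch of the kind of IR these checks exercise (constants and block names illustrative, not from the patch):

define amdgpu_cs i32 @ballot_eq_zero_sketch(i32 %v) {
entry:
  %bit = trunc i32 %v to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %bit)
  ; lowered to s_and_b32 + s_cbranch_scc0 on wave32
  %none = icmp eq i32 %ballot, 0
  br i1 %none, label %false, label %true
true:
  ret i32 42
false:
  ret i32 33
}

declare i32 @llvm.amdgcn.ballot.i32(i1)
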
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 8533e34..518af70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1750,7 +1750,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1763,7 +1763,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1776,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1789,7 +1789,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1800,7 +1800,7 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = lshr i65 %value, 33
@@ -1859,21 +1859,19 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
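
The rewritten scalar sequences work because an i65 here lives in s[0:1] (low 64 bits) plus s2 (bit 64), and after a right shift by 33 at most 32 result bits are nonzero, so a single 32-bit OR replaces the old 64-bit OR with a zeroed high word. The low-word math as a standalone IR sketch (function and parameter names illustrative):

define i32 @lshr_i65_33_low_word(i32 %s1, i32 %s2) {
  ; source bits [63:33] (the top 31 bits of s1) land in result bits [30:0]
  %mid = lshr i32 %s1, 1
  ; source bit 64 (the low bit of s2) lands in result bit 31
  %top = and i32 %s2, 1
  %hi = shl i32 %top, 31
  %lo = or i32 %hi, %mid
  ret i32 %lo
}
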
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
index af377b1..e0581f01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll
@@ -597,13 +597,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], 0x50
+; GFX7-NEXT: s_or_b32 s4, s3, 0x50
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -616,7 +616,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX8-NEXT: s_mov_b32 s3, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX8-NEXT: s_or_b32 s2, s2, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX9-NEXT: s_or_b32 s2, s2, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -644,7 +644,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX10-NEXT: s_or_b32 s2, s2, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -658,7 +658,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX11-NEXT: s_or_b32 s2, s2, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_or_b64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_or_b32 s2, s2, 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
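
Every hunk in this file is the same strength reduction: after a zext from i32, the high word of the 64-bit value is a known zero, so OR-ing in a constant that fits in 32 bits (0x50 here) only needs s_or_b32 on the low half. A minimal IR sketch of the pattern (function name illustrative):

define i64 @or_zext_sketch(i32 %x) {
  ; the high 32 bits of %ext are provably zero
  %ext = zext i32 %x to i64
  ; the constant fits in the low word, so a 32-bit OR suffices
  %or = or i64 %ext, 80
  ret i64 %or
}
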
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
index 6e4c6bc..31e3d97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
@@ -147,15 +147,17 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
- ; CHECK-NEXT: G_STORE [[UV4]](s16), [[COPY1]](p3) :: (store (s16), addrspace 3)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p3) :: (load (s16), addrspace 3)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s16), [[COPY1]](p3) :: (store (s16), addrspace 3)
; CHECK-NEXT: SI_RETURN
%0:_(p3) = COPY $vgpr0
%1:_(p3) = COPY $vgpr1
%2:_(<8 x s16>) = G_IMPLICIT_DEF
%3:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
- %4:_(s16) = G_SHUFFLE_VECTOR %3(<8 x s16>), %2, shufflemask(4)
+ %idx:_(s32) = G_CONSTANT i32 4
+ %4:_(s16) = G_EXTRACT_VECTOR_ELT %3(<8 x s16>), %idx
G_STORE %4(s16), %1(p3) :: (store (s16), addrspace 3)
SI_RETURN
...
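
Rewriting the single-index shuffle as G_EXTRACT_VECTOR_ELT with a constant index is what lets the combiner kill the wide load: lane 4 of a <8 x s16> sits at byte offset 4 * 2 = 8, matching the G_PTR_ADD by 8 plus the narrow s16 G_LOAD in the new checks. The same fold at IR level (names illustrative):

define void @extract_lane_sketch(ptr addrspace(3) %src, ptr addrspace(3) %dst) {
  %vec = load <8 x i16>, ptr addrspace(3) %src, align 8
  ; only lane 4 is used, so the load can shrink to one i16 at %src + 8
  %elt = extractelement <8 x i16> %vec, i32 4
  store i16 %elt, ptr addrspace(3) %dst, align 2
  ret void
}
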
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index a9b3deb..cfe655f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1381,7 +1381,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,7 +1393,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1405,7 +1405,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1418,7 +1418,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%shl = shl i65 %value, 33
%ashr = ashr i65 %value, 33
@@ -1429,29 +1429,27 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_lshr_b32 s3, s1, 14
+; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT: s_lshl_b32 s5, s2, 14
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_mov_b32 s4, 0
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
+; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 14
; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
%ashr = ashr i65 %shl, 18
@@ -1464,13 +1462,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GCN-NEXT: s_lshl_b32 s3, s2, 1
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GCN-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GCN-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GCN-NEXT: s_mov_b32 s1, s2
-; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_ashr_i32 s2, s5, 1
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GCN-NEXT: s_or_b32 s0, s0, s4
+; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_33:
@@ -1478,13 +1475,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31
-; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1f0000
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s1, s2
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
+; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_bfe_u32 s4, s0, 0x1f0000
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 33
%ashr = ashr i65 %shl, 33
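
These tests lean on the classic identity that a left shift followed by an arithmetic right shift by the same amount sign-extends from a fixed bit; the updated GCN sequences then apply the same 32-bit OR narrowing seen in lshr.ll. The identity in plain 64-bit IR (a sketch; the tests above use i65 split across scalar registers):

define i64 @sext_inreg_sketch(i64 %v) {
  ; shl + ashr by 18 sign-extends from bit 63 - 18 = 45
  %shl = shl i64 %v, 18
  %r = ashr i64 %shl, 18
  ret i64 %r
}
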
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
index ac903ad..e7bba9d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
@@ -19,15 +19,15 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(p0), [[UV1:%[0-9]+]]:_(p0) = G_UNMERGE_VALUES [[BITCAST]](<2 x p0>)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[UV]](p0)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](p0)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](p0)
; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV3]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%0:_(p0) = G_CONSTANT i64 0
%1:_(<2 x p0>) = G_BUILD_VECTOR %0:_(p0), %0:_(p0)
%2:_(<2 x p0>) = G_LOAD %0:_(p0) :: (load (<2 x p0>))
- %3:_(p0) = G_SHUFFLE_VECTOR %2:_(<2 x p0>), %1:_, shufflemask(0)
+ %idx:_(s32) = G_CONSTANT i32 0
+ %3:_(p0) = G_EXTRACT_VECTOR_ELT %2:_(<2 x p0>), %idx
%4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(p0)
$vgpr0 = COPY %4:_(s32)
$vgpr1 = COPY %5:_(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a7057..c551375 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_co_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_max_u32 s0, s0, s2
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
%ret = bitcast i32 %max to float
ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
%ret = bitcast i32 %max to float
ret float %ret
}
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_addk_co_i32 s0, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, 100
- %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s2, s0, 16
-; GISEL-NEXT: s_lshr_b32 s3, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s2, s2, s3
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
; SDAG-LABEL: add_max_v2u16_sss:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_sss:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s3, s0, 16
-; GISEL-NEXT: s_lshr_b32 s4, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s3, s3, s4
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: s_max_u32 s0, s0, s3
-; GISEL-NEXT: s_max_u32 s1, s1, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT: s_addk_co_i32 s1, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
 %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
 %max = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_sub_u32 s6, s2, s6
+; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
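
All the deletions in this file follow one rule: s_add_u32/s_sub_u32 already leave the carry or borrow in SCC, so when the next consumer is s_addc_u32/s_subb_u32, the s_cselect_b64 + s_cmp_lg_u64 pair that saved and re-materialized SCC was dead weight. A sketch of the underlying IR idiom (names illustrative):

define i32 @carry_into_high_sketch(i64 %a, i64 %b, i32 %hi) {
  ; 64-bit add with carry-out; the carry feeds a third 32-bit word,
  ; mirroring the s_add64_32 test above
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %carry = extractvalue { i64, i1 } %pair, 1
  %c32 = zext i1 %carry to i32
  %hi.out = add i32 %hi, %c32
  ret i32 %hi.out
}

declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
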
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT: s_addc_u32 s4, s13, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s10, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s12, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s10, s8
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s10
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s9, s4
+; GFX9-NEXT: s_addc_u32 s8, s8, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s17, s14, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, 0
+; GFX9-NEXT: s_subb_u32 s10, s13, s7
+; GFX9-NEXT: s_sub_u32 s11, s14, s6
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s17, s14
+; GFX9-NEXT: s_cselect_b32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s14, s13, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: s_add_i32 s16, s13, s14
+; GFX6-NEXT: s_sub_i32 s14, s9, s16
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s17, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s2
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s19, s19, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s20, s17, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s14, s3
+; GFX6-NEXT: s_sub_u32 s18, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s14, s15
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s2
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s17, s17, s3
+; GFX6-NEXT: s_sub_u32 s21, s18, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s13, s20, s17
-; GFX6-NEXT: s_cselect_b32 s12, s12, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s14
+; GFX6-NEXT: s_cselect_b32 s2, s2, s12
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s12, s9
-; GFX6-NEXT: s_cselect_b32 s2, s13, s8
+; GFX6-NEXT: s_cselect_b32 s3, s14, s9
+; GFX6-NEXT: s_cselect_b32 s2, s15, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s12, s2, s6
-; GFX6-NEXT: s_subb_u32 s13, s3, s6
+; GFX6-NEXT: s_sub_u32 s14, s2, s6
+; GFX6-NEXT: s_subb_u32 s15, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s14
+; GFX6-NEXT: s_mul_i32 s1, s8, s12
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s8, s2
+; GFX6-NEXT: s_mul_i32 s13, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s13
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s12, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_addc_u32 s4, s12, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s9, s15, s9
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
+; GFX6-NEXT: s_add_u32 s9, s13, s9
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s8
+; GFX6-NEXT: s_addc_u32 s3, s12, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s14, s5, s2
+; GFX6-NEXT: s_add_u32 s12, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s15, s4, s8
+; GFX6-NEXT: s_addc_u32 s13, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s15
+; GFX6-NEXT: s_mul_i32 s2, s8, s13
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s14
+; GFX6-NEXT: s_addc_u32 s2, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s15
+; GFX6-NEXT: s_mul_i32 s11, s9, s13
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_add_i32 s10, s14, s10
-; GFX6-NEXT: s_mul_i32 s14, s7, s11
-; GFX6-NEXT: s_add_i32 s14, s10, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s10, s12, s10
+; GFX6-NEXT: s_mul_i32 s12, s7, s11
+; GFX6-NEXT: s_add_i32 s16, s10, s12
+; GFX6-NEXT: s_sub_i32 s12, s9, s16
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s17, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s11, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s19, s19, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s20, s17, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s13, s10, s11
+; GFX6-NEXT: s_subb_u32 s17, s12, s7
+; GFX6-NEXT: s_sub_u32 s18, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s12, s13
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s17, s7
+; GFX6-NEXT: s_sub_u32 s21, s18, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s11, s20, s17
-; GFX6-NEXT: s_cselect_b32 s10, s10, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s10, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s10
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s10, s9
-; GFX6-NEXT: s_cselect_b32 s6, s11, s8
+; GFX6-NEXT: s_cselect_b32 s7, s12, s9
+; GFX6-NEXT: s_cselect_b32 s6, s13, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s12, 0, s2
-; GFX9-NEXT: s_subb_u32 s13, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s15, s13, s6
-; GFX9-NEXT: s_add_i32 s7, s16, s7
-; GFX9-NEXT: s_mul_i32 s17, s12, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT: s_mul_i32 s18, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v0
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT: s_mul_i32 s15, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT: s_mul_i32 s17, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT: s_mul_i32 s17, s12, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s7, s14, s7
-; GFX9-NEXT: s_add_u32 s7, s15, s7
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s14, s15, s14
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s6, s7
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_i32 s6, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT: s_add_i32 s6, s7, s6
-; GFX9-NEXT: s_mul_i32 s13, s13, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s16, s6
-; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_add_u32 s13, s13, s14
+; GFX9-NEXT: s_addc_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT: s_mul_i32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT: s_addc_u32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT: s_addc_u32 s6, s17, s15
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s6, s14, s6
-; GFX9-NEXT: s_add_u32 s6, s12, s6
-; GFX9-NEXT: s_addc_u32 s12, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s16, s6
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s13, s6
+; GFX9-NEXT: s_addc_u32 s12, s12, s7
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s3
-; GFX9-NEXT: s_sub_u32 s21, s18, s2
-; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, 0
+; GFX9-NEXT: s_subb_u32 s14, s17, s3
+; GFX9-NEXT: s_sub_u32 s15, s18, s2
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s21, s18
+; GFX9-NEXT: s_cselect_b32 s15, s15, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s4, 0, s2
+; GFX9-NEXT: s_subb_u32 s5, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s6, s9
-; GFX9-NEXT: s_mul_i32 s5, s7, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s15, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s7
+; GFX9-NEXT: s_mul_i32 s15, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, s17
+; GFX9-NEXT: s_addc_u32 s7, s7, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s5, s5, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s4, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s7, s7, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT: s_mul_i32 s9, s8, s6
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_addc_u32 s7, s9, s8
+; GFX9-NEXT: s_mul_i32 s8, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT: s_mul_i32 s14, s7, s4
+; GFX9-NEXT: s_mul_i32 s16, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s6, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT: s_addc_u32 s6, s15, s7
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT: s_addc_u32 s4, s15, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s8, s4
-; GFX9-NEXT: s_add_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s8, s6
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s8, s6, s4
+; GFX9-NEXT: s_addc_u32 s9, s7, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT: s_mul_i32 s11, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s9, s11, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s9, s8
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s9, s7, s9
+; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s3
-; GFX9-NEXT: s_sub_u32 s19, s16, s2
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, 0
+; GFX9-NEXT: s_subb_u32 s10, s15, s3
+; GFX9-NEXT: s_sub_u32 s11, s16, s2
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s19, s16
+; GFX9-NEXT: s_cselect_b32 s11, s11, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 394727c..01f4414 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 258bc295..9db6d70 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e4def28..9afc0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 39a3c9a..10fd34f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57b..30ad46d9 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX1250-NEXT: v_cls_i32_e32 v3, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3
; GFX1250-NEXT: v_cls_i32_e32 v6, v3
; GFX1250-NEXT: v_cls_i32_e32 v7, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16: ; %bb.0:
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
-; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v7, v10, v8
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v8, v11, v9
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8
-; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
-; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v4
; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16: ; %bb.0:
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v6, v2, v3
+; GFX1250FAKE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250FAKE16-NEXT: v_xor_b32_e32 v7, v2, v3
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250FAKE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7
-; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6
-; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14
+; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1]
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8
+; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v7, v10, v7
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v6, v6, v8
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v9, v11, v9
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5]
-; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v9, v[0:1]
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
; GFX1250FAKE16-NEXT: v_ldexp_f32 v0, v0, v4
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v8
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
@@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX1250-NEXT: v_cls_i32_e32 v9, v7
; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7
-; GFX1250-NEXT: v_cls_i32_e32 v12, v7
-; GFX1250-NEXT: v_cls_i32_e32 v13, v5
-; GFX1250-NEXT: v_cls_i32_e32 v14, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8
-; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3
-; GFX1250-NEXT: v_cls_i32_e32 v15, v1
-; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14
-; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8
-; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11
-; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10
+; GFX1250-NEXT: v_cls_i32_e32 v10, v5
+; GFX1250-NEXT: v_xor_b32_e32 v14, v0, v1
+; GFX1250-NEXT: v_cls_i32_e32 v12, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
-; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14
+; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_min_u32_e32 v8, v9, v8
+; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v13
+; GFX1250-NEXT: v_cls_i32_e32 v13, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11
+; GFX1250-NEXT: v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14
+; GFX1250-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_min_u32_e32 v9, v12, v9
+; GFX1250-NEXT: v_min_u32_e32 v11, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v10, v[4:5]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3]
+; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1]
+; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6
; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9
+; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v6
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX1250-NEXT: v_ldexp_f32 v1, v3, v1
; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index afd0f01..6831380 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -415,28 +415,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-LABEL: memcpy_known:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -491,25 +481,16 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -960,28 +941,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-LABEL: memcpy_known_medium:
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1036,25 +1007,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12
-; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3
-; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3]
-; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11
-; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15]
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
+; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10
+; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11
; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
@@ -1228,27 +1190,18 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942: ; %bb.0:
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s1
; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
@@ -1261,35 +1214,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13
-; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13
+; GISEL-GFX1100-NEXT: s_load_b32 s11, s[4:5], 0x34
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1
-; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX1100-NEXT: s_or_b64 s[0:1], s[12:13], s[8:9]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7]
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen
+; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
-; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9
-; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8
-; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5]
-; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
+; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a6fa4f..b96de17 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s12, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_add_u32 s10, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s13, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s13
+; VI-NEXT: s_addc_u32 s11, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s11
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s12
-; VI-NEXT: s_add_i32 s14, s8, s9
-; VI-NEXT: s_sub_i32 s10, s3, s14
+; VI-NEXT: s_mul_i32 s9, s5, s10
+; VI-NEXT: s_add_i32 s12, s8, s9
+; VI-NEXT: s_sub_i32 s13, s3, s12
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s15, s2, s8
+; VI-NEXT: s_sub_u32 s14, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s16, s10, s5
-; VI-NEXT: s_sub_u32 s17, s15, s4
-; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
-; VI-NEXT: s_subb_u32 s10, s16, 0
-; VI-NEXT: s_cmp_ge_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s11, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s17, s4
+; VI-NEXT: s_subb_u32 s13, s13, s5
+; VI-NEXT: s_sub_u32 s15, s14, s4
+; VI-NEXT: s_subb_u32 s13, s13, 0
+; VI-NEXT: s_cmp_ge_u32 s13, s5
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s10, s16, s11
-; VI-NEXT: s_add_u32 s11, s12, 1
-; VI-NEXT: s_addc_u32 s16, s13, 0
-; VI-NEXT: s_add_u32 s17, s12, 2
-; VI-NEXT: s_addc_u32 s18, s13, 0
-; VI-NEXT: s_cmp_lg_u32 s10, 0
-; VI-NEXT: s_cselect_b32 s10, s17, s11
-; VI-NEXT: s_cselect_b32 s11, s18, s16
+; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cselect_b32 s15, -1, 0
+; VI-NEXT: s_cmp_eq_u32 s13, s5
+; VI-NEXT: s_cselect_b32 s13, s15, s16
+; VI-NEXT: s_add_u32 s15, s10, 1
+; VI-NEXT: s_addc_u32 s16, s11, 0
+; VI-NEXT: s_add_u32 s17, s10, 2
+; VI-NEXT: s_addc_u32 s18, s11, 0
+; VI-NEXT: s_cmp_lg_u32 s13, 0
+; VI-NEXT: s_cselect_b32 s13, s17, s15
+; VI-NEXT: s_cselect_b32 s15, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s14
+; VI-NEXT: s_subb_u32 s3, s3, s12
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cmp_ge_u32 s14, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s11, s13
-; VI-NEXT: s_cselect_b32 s8, s10, s12
+; VI-NEXT: s_cselect_b32 s9, s15, s11
+; VI-NEXT: s_cselect_b32 s8, s13, s10
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s10, 0, s6
-; GFX9-NEXT: s_subb_u32 s11, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s13, s11, s8
-; GFX9-NEXT: s_add_i32 s9, s14, s9
-; GFX9-NEXT: s_add_i32 s9, s9, s13
-; GFX9-NEXT: s_mul_i32 s15, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v1
+; GFX9-NEXT: v_readfirstlane_b32 s11, v0
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX9-NEXT: s_mul_i32 s13, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s15, s8, s11
+; GFX9-NEXT: s_mul_i32 s14, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s9, s12, s9
-; GFX9-NEXT: s_add_u32 s9, s13, s9
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s8, s9
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s8, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
+; GFX9-NEXT: s_addc_u32 s8, s15, s13
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s8, s12, s8
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s10, 0, s9
-; GFX9-NEXT: s_add_u32 s11, s14, s8
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s8, s12, s10
-; GFX9-NEXT: s_mul_i32 s10, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_add_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
-; GFX9-NEXT: s_mul_i32 s11, s3, s11
-; GFX9-NEXT: s_add_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
-; GFX9-NEXT: s_addc_u32 s9, s9, s13
-; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s9
+; GFX9-NEXT: s_mul_i32 s11, s2, s9
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT: s_add_u32 s11, s12, s11
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s12, s9, s8
-; GFX9-NEXT: s_addc_u32 s13, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_mul_i32 s9, s3, s9
+; GFX9-NEXT: s_add_u32 s11, s8, s9
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s10
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s12
-; GFX9-NEXT: s_add_i32 s14, s8, s9
-; GFX9-NEXT: s_sub_i32 s10, s3, s14
-; GFX9-NEXT: s_mul_i32 s8, s6, s12
-; GFX9-NEXT: s_sub_u32 s15, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s11
+; GFX9-NEXT: s_add_i32 s12, s8, s9
+; GFX9-NEXT: s_sub_i32 s13, s3, s12
+; GFX9-NEXT: s_mul_i32 s8, s6, s11
+; GFX9-NEXT: s_sub_u32 s14, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s16, s10, s7
-; GFX9-NEXT: s_sub_u32 s17, s15, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s16, 0
-; GFX9-NEXT: s_cmp_ge_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s11, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s15, s14, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, 0
+; GFX9-NEXT: s_cmp_ge_u32 s13, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s10, s16, s11
-; GFX9-NEXT: s_add_u32 s11, s12, 1
-; GFX9-NEXT: s_addc_u32 s16, s13, 0
-; GFX9-NEXT: s_add_u32 s17, s12, 2
-; GFX9-NEXT: s_addc_u32 s18, s13, 0
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s10, s17, s11
-; GFX9-NEXT: s_cselect_b32 s11, s18, s16
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cselect_b32 s15, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s13, s7
+; GFX9-NEXT: s_cselect_b32 s13, s15, s16
+; GFX9-NEXT: s_add_u32 s15, s11, 1
+; GFX9-NEXT: s_addc_u32 s16, s10, 0
+; GFX9-NEXT: s_add_u32 s17, s11, 2
+; GFX9-NEXT: s_addc_u32 s18, s10, 0
+; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b32 s13, s17, s15
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s12
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cmp_ge_u32 s14, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s11, s13
-; GFX9-NEXT: s_cselect_b32 s8, s10, s12
+; GFX9-NEXT: s_cselect_b32 s9, s15, s10
+; GFX9-NEXT: s_cselect_b32 s8, s13, s11
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1010-NEXT: s_mul_i32 s12, s9, s8
; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1010-NEXT: s_add_i32 s9, s13, s9
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_i32 s9, s11, s9
+; GFX1010-NEXT: s_mul_i32 s11, s5, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_i32 s10, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s12, s12, s15
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s12, s10
+; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s10, s11
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1010-NEXT: s_add_u32 s11, s11, s12
-; GFX1010-NEXT: s_addc_u32 s10, 0, s10
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s9, s9, s12
+; GFX1010-NEXT: s_addc_u32 s11, 0, s11
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s11, s8
+; GFX1010-NEXT: s_add_u32 s8, s9, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s10, s9
+; GFX1010-NEXT: s_addc_u32 s8, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
-; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
+; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
-; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
+; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
+; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
-; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
-; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
-; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
+; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
-; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
+; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
+; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
-; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
-; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
-; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
+; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
+; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
+; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
+; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
-; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
+; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
-; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
+; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
-; GFX11-NEXT: s_cselect_b32 s11, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
-; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX11-NEXT: s_mul_i32 s12, s9, s8
; GFX11-NEXT: s_mul_i32 s9, s9, s7
-; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX11-NEXT: s_add_i32 s9, s13, s9
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_add_i32 s9, s11, s9
+; GFX11-NEXT: s_mul_i32 s11, s7, s12
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_i32 s10, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s12, s12, s15
+; GFX11-NEXT: s_add_u32 s10, s10, s15
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s12, s10
+; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s10, s11
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s11, 0
+; GFX11-NEXT: s_addc_u32 s11, s12, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
-; GFX11-NEXT: s_cselect_b32 s9, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX11-NEXT: s_add_u32 s11, s11, s12
-; GFX11-NEXT: s_addc_u32 s10, 0, s10
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_add_u32 s9, s9, s12
+; GFX11-NEXT: s_addc_u32 s11, 0, s11
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s11, s8
+; GFX11-NEXT: s_add_u32 s8, s9, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s10, s9
+; GFX11-NEXT: s_addc_u32 s8, s11, s10
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
-; GFX11-NEXT: s_cselect_b32 s14, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
-; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
-; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
-; GFX1250-NEXT: s_mul_i32 s11, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
+; GFX1250-NEXT: s_mul_i32 s12, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
-; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 57a1e4c..ec92edb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -3385,7 +3385,7 @@ declare half @llvm.canonicalize.f16(half)
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
attributes #0 = { nounwind "amdgpu-ieee"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11NONANS-FAKE16: {{.*}}
; GFX11NONANS-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4b151b9..07e6a76 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index cefcbdd..fca57be 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
-; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6c8207a..df7f8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4344,7 +4344,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4375,14 +4375,12 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
@@ -4437,7 +4435,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
@@ -4450,15 +4448,13 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d8a5e7fa..dbdea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
-; GFX10-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s2, s2
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s2, 0
+; GFX9-NEXT: s_add_u32 s1, s0, s0
+; GFX9-NEXT: s_addc_u32 s0, s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 62847b1..9a17538 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_or_b32 s2, s6, s2
-; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_or_b32 s2, s6, s2
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-NEXT: s_or_b32 s2, s5, s2
-; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index acb32d4..11476a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -127,7 +127,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num
; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 2.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, 10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -151,7 +151,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999
; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]]
; GCN: buffer_store_dwordx2 [[MUL]]
-define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #0 {
%x = load double, ptr addrspace(1) poison
%rcp = fdiv fast double %x, -10.0
store double %rcp, ptr addrspace(1) %out, align 4
@@ -159,4 +159,3 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 {
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index b5b2655..31344c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2080,21 +2080,13 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs
}
define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
index 92eb4a6..0a266bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -284,4 +284,4 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <
ret <2 x float> %tmp1
}
-attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "no-infs-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbad..62ec010 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fmaximum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fmaximum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fmaximum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.maximum.f32(float %a, float %b)
ret float %val
}
+define amdgpu_ps float @test_fmaximum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fmaximum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_signed_zero_f32() {
+; GFX9-LABEL: test_fmaximum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fmaximum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fmaximum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v2
-; GCN-NEXT: v_maximum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v2
+; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f32 s0, s0, s2
-; GCN-NEXT: s_maximum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f32 s0, s0, s2
+; GFX12-NEXT: s_maximum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fmaximum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fmaximum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v3
-; GCN-NEXT: v_maximum_f32 v1, v1, v4
-; GCN-NEXT: v_maximum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v3
+; GFX12-NEXT: v_maximum_f32 v1, v1, v4
+; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fmaximum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fmaximum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v4
-; GCN-NEXT: v_maximum_f32 v1, v1, v5
-; GCN-NEXT: v_maximum_f32 v2, v2, v6
-; GCN-NEXT: v_maximum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_max_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v4
+; GFX12-NEXT: v_maximum_f32 v1, v1, v5
+; GFX12-NEXT: v_maximum_f32 v2, v2, v6
+; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fmaximum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fmaximum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f32 v0, v0, v16
-; GCN-NEXT: v_maximum_f32 v1, v1, v17
-; GCN-NEXT: v_maximum_f32 v2, v2, v18
-; GCN-NEXT: v_maximum_f32 v3, v3, v19
-; GCN-NEXT: v_maximum_f32 v4, v4, v20
-; GCN-NEXT: v_maximum_f32 v5, v5, v21
-; GCN-NEXT: v_maximum_f32 v6, v6, v22
-; GCN-NEXT: v_maximum_f32 v7, v7, v23
-; GCN-NEXT: v_maximum_f32 v8, v8, v24
-; GCN-NEXT: v_maximum_f32 v9, v9, v25
-; GCN-NEXT: v_maximum_f32 v10, v10, v26
-; GCN-NEXT: v_maximum_f32 v11, v11, v27
-; GCN-NEXT: v_maximum_f32 v12, v12, v28
-; GCN-NEXT: v_maximum_f32 v13, v13, v29
-; GCN-NEXT: v_maximum_f32 v14, v14, v30
-; GCN-NEXT: v_maximum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_max_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_max_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_max_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_max_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_max_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_max_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_max_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v16
+; GFX12-NEXT: v_maximum_f32 v1, v1, v17
+; GFX12-NEXT: v_maximum_f32 v2, v2, v18
+; GFX12-NEXT: v_maximum_f32 v3, v3, v19
+; GFX12-NEXT: v_maximum_f32 v4, v4, v20
+; GFX12-NEXT: v_maximum_f32 v5, v5, v21
+; GFX12-NEXT: v_maximum_f32 v6, v6, v22
+; GFX12-NEXT: v_maximum_f32 v7, v7, v23
+; GFX12-NEXT: v_maximum_f32 v8, v8, v24
+; GFX12-NEXT: v_maximum_f32 v9, v9, v25
+; GFX12-NEXT: v_maximum_f32 v10, v10, v26
+; GFX12-NEXT: v_maximum_f32 v11, v11, v27
+; GFX12-NEXT: v_maximum_f32 v12, v12, v28
+; GFX12-NEXT: v_maximum_f32 v13, v13, v29
+; GFX12-NEXT: v_maximum_f32 v14, v14, v30
+; GFX12-NEXT: v_maximum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fmaximum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fmaximum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fmaximum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fmaximum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_maximum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fmaximum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_maximum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.maximum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fmaximum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_max_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fmaximum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fmaximum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_maximum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_max_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_max_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fmaximum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fmaximum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fmaximum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.maximum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fmaximum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fmaximum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fmaximum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_max_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_max_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fmaximum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_max_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_max_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_max_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_maximum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_maximum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fmaximumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_maximum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fmaximumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fmaximumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_maximum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.maximum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fmaximum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fmaximum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
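+; The next two tests check that the llvm.maximum expansion is the same
+; whether the "amdgpu-ieee" attribute is "true" (#0) or "false" (#1).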
+define amdgpu_ps float @test_fmaximum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fmaximum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fmaximum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fmaximum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.maximum.f32(float, float)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.maximum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
index bc85dc2..3e513de 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
@@ -219,8 +219,8 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat
}
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 3145a27..60ac0b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8905,4 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f..474ac7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,117 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
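+; GFX9 has no v_minimum/v_maximum instructions, so llvm.minimum is expanded
+; into v_min plus an ordered compare that selects a canonical quiet NaN when
+; either input is NaN (0x7e00 for f16, 0x7fc00000 for f32, and 0x7ff80000 in
+; the high dword for f64).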
define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
-; GCN-LABEL: test_fminimum_f32_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_ss(float inreg %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_f32_vs(float %a, float inreg %b) {
-; GCN-LABEL: test_fminimum_f32_vs:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f32_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_vs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
define amdgpu_ps float @test_fminimum_nnan_f32(float %a, float %b) {
-; GCN-LABEL: test_fminimum_nnan_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_nnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nnan_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call nnan float @llvm.minimum.f32(float %a, float %b)
ret float %val
}
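+; Unlike nnan, the nsz flag alone does not allow folding to a bare v_min on
+; GFX9; the NaN compare-and-select is still emitted.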
+define amdgpu_ps float @test_fminimum_nsz_f32(float %a, float %b) {
+; GFX9-LABEL: test_fminimum_nsz_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_nsz_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call nsz float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
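+; minimum(-0.0, +0.0) constant-folds to -0.0: v_bfrev_b32 v0, 1 bit-reverses
+; the immediate 1 into 0x80000000, the bit pattern of -0.0f.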
+define amdgpu_ps float @test_fminimum_signed_zero_f32() {
+; GFX9-LABEL: test_fminimum_signed_zero_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_signed_zero_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float -0.0, float 0.0)
+ ret float %val
+}
+
define amdgpu_ps <2 x float> @test_fminimum_v2f32(<2 x float> %a, <2 x float> %b) {
-; GCN-LABEL: test_fminimum_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v2
-; GCN-NEXT: v_minimum_f32 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v2
+; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <2 x float> @test_fminimum_v2f32_ss(<2 x float> inreg %a, <2 x float> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f32_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f32 s0, s0, s2
-; GCN-NEXT: s_minimum_f32 s1, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v2f32_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f32_e32 v3, s1, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f32_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f32 s0, s0, s2
+; GFX12-NEXT: s_minimum_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %val
}
define amdgpu_ps <3 x float> @test_fminimum_v3f32(<3 x float> %a, <3 x float> %b) {
-; GCN-LABEL: test_fminimum_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v3
-; GCN-NEXT: v_minimum_f32 v1, v1, v4
-; GCN-NEXT: v_minimum_f32 v2, v2, v5
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v6, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v1, v4
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v3, v2, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v3
+; GFX12-NEXT: v_minimum_f32 v1, v1, v4
+; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: ; return to shader part epilog
%val = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
ret <3 x float> %val
}
define amdgpu_ps <4 x float> @test_fminimum_v4f32(<4 x float> %a, <4 x float> %b) {
-; GCN-LABEL: test_fminimum_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v4
-; GCN-NEXT: v_minimum_f32 v1, v1, v5
-; GCN-NEXT: v_minimum_f32 v2, v2, v6
-; GCN-NEXT: v_minimum_f32 v3, v3, v7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v8, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v5
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v2, v6
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT: v_min_f32_e32 v4, v3, v7
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v4
+; GFX12-NEXT: v_minimum_f32 v1, v1, v5
+; GFX12-NEXT: v_minimum_f32 v2, v2, v6
+; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %val
}
define amdgpu_ps <16 x float> @test_fminimum_v16f32(<16 x float> %a, <16 x float> %b) {
-; GCN-LABEL: test_fminimum_v16f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f32 v0, v0, v16
-; GCN-NEXT: v_minimum_f32 v1, v1, v17
-; GCN-NEXT: v_minimum_f32 v2, v2, v18
-; GCN-NEXT: v_minimum_f32 v3, v3, v19
-; GCN-NEXT: v_minimum_f32 v4, v4, v20
-; GCN-NEXT: v_minimum_f32 v5, v5, v21
-; GCN-NEXT: v_minimum_f32 v6, v6, v22
-; GCN-NEXT: v_minimum_f32 v7, v7, v23
-; GCN-NEXT: v_minimum_f32 v8, v8, v24
-; GCN-NEXT: v_minimum_f32 v9, v9, v25
-; GCN-NEXT: v_minimum_f32 v10, v10, v26
-; GCN-NEXT: v_minimum_f32 v11, v11, v27
-; GCN-NEXT: v_minimum_f32 v12, v12, v28
-; GCN-NEXT: v_minimum_f32 v13, v13, v29
-; GCN-NEXT: v_minimum_f32 v14, v14, v30
-; GCN-NEXT: v_minimum_f32 v15, v15, v31
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v32, v1, v17
+; GFX9-NEXT: v_mov_b32_e32 v33, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v16
+; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX9-NEXT: v_min_f32_e32 v17, v2, v18
+; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], v2, v18
+; GFX9-NEXT: v_min_f32_e32 v18, v3, v19
+; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], v3, v19
+; GFX9-NEXT: v_min_f32_e32 v19, v4, v20
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v20
+; GFX9-NEXT: v_min_f32_e32 v20, v5, v21
+; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v5, v21
+; GFX9-NEXT: v_min_f32_e32 v21, v6, v22
+; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v6, v22
+; GFX9-NEXT: v_min_f32_e32 v22, v7, v23
+; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v7, v23
+; GFX9-NEXT: v_min_f32_e32 v23, v8, v24
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v33, v1, s[12:13]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v34, v9, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v23, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v35, v10, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v36, v11, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v35, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v37, v12, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v36, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v16, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v37, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v14, v30
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v16, vcc
+; GFX9-NEXT: v_min_f32_e32 v16, v15, v31
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v33, v17, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v33, v18, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v33, v19, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v33, v20, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v33, v21, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v33, v22, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v16, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v16f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v16
+; GFX12-NEXT: v_minimum_f32 v1, v1, v17
+; GFX12-NEXT: v_minimum_f32 v2, v2, v18
+; GFX12-NEXT: v_minimum_f32 v3, v3, v19
+; GFX12-NEXT: v_minimum_f32 v4, v4, v20
+; GFX12-NEXT: v_minimum_f32 v5, v5, v21
+; GFX12-NEXT: v_minimum_f32 v6, v6, v22
+; GFX12-NEXT: v_minimum_f32 v7, v7, v23
+; GFX12-NEXT: v_minimum_f32 v8, v8, v24
+; GFX12-NEXT: v_minimum_f32 v9, v9, v25
+; GFX12-NEXT: v_minimum_f32 v10, v10, v26
+; GFX12-NEXT: v_minimum_f32 v11, v11, v27
+; GFX12-NEXT: v_minimum_f32 v12, v12, v28
+; GFX12-NEXT: v_minimum_f32 v13, v13, v29
+; GFX12-NEXT: v_minimum_f32 v14, v14, v30
+; GFX12-NEXT: v_minimum_f32 v15, v15, v31
+; GFX12-NEXT: ; return to shader part epilog
%val = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %val
}
define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
+; GFX9-LABEL: test_fminimum_f16_vv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-TRUE16-LABEL: test_fminimum_f16_vv:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l
@@ -136,35 +315,131 @@ define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
}
define amdgpu_ps half @test_fminimum_f16_ss(half inreg %a, half inreg %b) {
-; GCN-LABEL: test_fminimum_f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_minimum_f16 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: test_fminimum_f16_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_minimum_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
%val = call half @llvm.minimum.f16(half %a, half %b)
ret half %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_vv(<2 x half> %a, <2 x half> %b) {
-; GCN-LABEL: test_fminimum_v2f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v0, v2
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
@@ -187,6 +462,49 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
+; GFX9-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, s0, v3
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
@@ -206,97 +524,384 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16(<4 x half> %a, <4 x half> %b) {
-; GCN-LABEL: test_fminimum_v4f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_pk_min_f16 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX9-SDAG-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], v1, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v6, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <4 x half> @test_fminimum_v4f16_ss(<4 x half> inreg %a, <4 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: v_pk_minimum_f16 v1, s1, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f16_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-SDAG-NEXT: v_pk_min_f16 v1, s1, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s1, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-SDAG-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-SDAG-NEXT: v_pk_min_f16 v4, s0, v4
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_cmp_o_f16_e32 vcc, s0, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v2, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-SDAG-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f16_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-GISEL-NEXT: v_pk_min_f16 v1, s0, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s4, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-GISEL-NEXT: v_pk_min_f16 v2, s1, v1
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s0, v3
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s1, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, v2, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f16_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %val
}
define amdgpu_ps <2 x float> @test_fminimum_f64_vv(double %a, double %b) {
-; GCN-LABEL: test_fminimum_f64_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_vv:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_vv:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_vv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <2 x float> @test_fminimum_f64_ss(double inreg %a, double inreg %b) {
-; GCN-LABEL: test_fminimum_f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: ; return to shader part epilog
%val = call double @llvm.minimum.f64(double %a, double %b)
%ret = bitcast double %val to <2 x float>
ret <2 x float> %ret
}
define amdgpu_ps <4 x float> @test_fminimum_v2f64_ss(<2 x double> inreg %a, <2 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v2f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v2f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v2f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v2f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
%ret = bitcast <2 x double> %val to <4 x float>
ret <4 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64(<4 x double> %a, <4 x double> %b) {
-; GCN-LABEL: test_fminimum_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-SDAG-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-SDAG-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX9-GISEL-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], v[4:5], v[12:13]
+; GFX9-GISEL-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v16, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v17, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v9, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v18, v11, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v18, v13, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x double> inreg %b) {
-; GCN-LABEL: test_fminimum_v4f64_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
-; GCN-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
-; GCN-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
-; GCN-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-SDAG-LABEL: test_fminimum_v4f64_ss:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-SDAG-NEXT: v_min_f64 v[6:7], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[4:5], v[1:2]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-SDAG-NEXT: v_min_f64 v[8:9], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[1:2]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[2:3]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[4:5]
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX9-GISEL-LABEL: test_fminimum_v4f64_ss:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-GISEL-NEXT: v_min_f64 v[4:5], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[2:3], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-GISEL-NEXT: v_min_f64 v[6:7], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[4:5], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-GISEL-NEXT: v_min_f64 v[8:9], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e64 s[4:5], s[6:7], v[0:1]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v5, s[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v10, v7, s[2:3]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
+; GFX9-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_v4f64_ss:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[8:9]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[10:11]
+; GFX12-NEXT: v_minimum_f64 v[4:5], s[4:5], s[12:13]
+; GFX12-NEXT: v_minimum_f64 v[6:7], s[6:7], s[14:15]
+; GFX12-NEXT: ; return to shader part epilog
%val = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
%ret = bitcast <4 x double> %val to <8 x float>
ret <8 x float> %ret
}
define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fminimumi_f32_move_to_valu:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_minimum_f32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fminimumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: fminimumi_f32_move_to_valu:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_minimum_f32 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%v = call float @llvm.minimum.f32(float %a, float %b)
@@ -305,6 +910,23 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+; GFX9-LABEL: fminimum_f16_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
; GFX12-SDAG-TRUE16-LABEL: fminimum_f16_move_to_valu:
; GFX12-SDAG-TRUE16: ; %bb.0:
; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
@@ -371,6 +993,40 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
ret void
}
+define amdgpu_ps float @test_fminimum_f32_ieee_on(float %a, float %b) #0 {
+; GFX9-LABEL: test_fminimum_f32_ieee_on:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_on:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
+define amdgpu_ps float @test_fminimum_f32_ieee_off(float %a, float %b) #1 {
+; GFX9-LABEL: test_fminimum_f32_ieee_off:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_fminimum_f32_ieee_off:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %val = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %val
+}
+
declare float @llvm.minimum.f32(float, float)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <3 x float> @llvm.minimum.v3f32(<3 x float>, <3 x float>)
@@ -383,3 +1039,6 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+
+attributes #0 = { nounwind "amdgpu-ieee"="true" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
index d8bbda1..69d1ee3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
@@ -159,7 +159,7 @@ declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index aaea4f7..b3202cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -8006,7 +8006,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN-NSZ: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index b0dd187..c28b25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
-; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
-; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index f1165491..b6b26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
-; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6f91222..d8cbdb1 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -2048,7 +2048,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1200-FAKE16-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
@@ -3417,7 +3417,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
@@ -4821,7 +4821,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1200-NEXT: s_endpgm
- ptr addrspace(1) %in2) #1 {
+ ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
%r2 = frem afn double %r0, %r1
@@ -18918,7 +18918,4 @@ define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 {
-attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-
-
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 1b74ddf..9b97981 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -2870,7 +2870,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
ret double %result
}
-define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
+define double @v_sqrt_f64__unsafe_attr(double %x) {
; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3449,7 +3449,6 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nounwind willreturn memory(none) }
attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
-attributes #4 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
; GFX8: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
index 9f19bcb..c93c077 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
@@ -239,4 +239,4 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 94afa88..9ebf6ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4666,21 +4666,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 16
%addr = inttoptr i64 %or to ptr addrspace(1)
@@ -4707,21 +4699,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d1..31f277f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 4e93eca..c33b3344 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -36,18 +36,18 @@ entry:
ret void
}
-attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"}
-attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "uniform-work-group-size"="false"}
+attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
;.
-; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" }
+; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
-; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
+; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
;.
-; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" }
+; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" }
;.
; UNSAFE: [[META0]] = !{}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eee232a..c3f3917 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_readfirstlane_b32 s13, v0
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT: s_and_b32 s1, s8, s1
-; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT: s_and_b32 s13, s8, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s13
-; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_cselect_b32 s1, s19, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 1
-; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 9684712..2f9182e 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -1066,13 +1066,13 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3]
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v8, v5
; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3]
; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1
-; GISEL-NEXT: v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v9, v2, v0
; GISEL-NEXT: v_mov_b32_e32 v7, v6
; GISEL-NEXT: ; %bb.12: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
new file mode 100644
index 0000000..99421d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.add.min.max.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250-GISEL %s
+
+declare i32 @llvm.amdgcn.add.min.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.min.u32(i32, i32, i32, i1)
+declare i32 @llvm.amdgcn.add.max.u32(i32, i32, i32, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+declare <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16>, <2 x i16>, <2 x i16>, i1)
+
+define i32 @test_add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_min_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_min_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_min_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_min_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.min.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_i32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_i32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_i32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_add_max_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 %c, i1 0)
+ ret i32 %ret
+}
+
+define i32 @test_add_max_u32_ssi_clamp(i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: test_add_max_u32_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, 1 clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.add.max.u32(i32 %a, i32 %b, i32 1, i1 1)
+ ret i32 %ret
+}
+
+define <2 x i16> @test_add_min_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_min_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_min_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_min_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_min_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_i16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_i16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_i16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_i16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: test_add_max_u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, i1 0)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_add_max_u16_ssi_clamp(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GCN-LABEL: test_add_max_u16_ssi_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, 1 op_sel_hi:[1,1,0] clamp
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> %a, <2 x i16> %b, <2 x i16> <i16 1, i16 1>, i1 1)
+ ret <2 x i16> %ret
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 883db20..e30a586 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1485,7 +1485,7 @@ define float @v_exp2_f32_fast(float %in) {
ret float %result
}
-define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_exp2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 0854134..61a777f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -1907,7 +1907,7 @@ define float @v_log2_f32_fast(float %in) {
ret float %result
}
-define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
+define float @v_log2_f32_unsafe_math_attr(float %in) {
; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 8748aff..6dc9199 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 1e4b633..fc36ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -45,27 +45,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
@@ -105,27 +96,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
@@ -168,29 +150,22 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-GISEL-NEXT: s_endpgm
@@ -234,32 +209,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-GISEL-NEXT: s_endpgm
@@ -303,32 +267,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-GISEL-NEXT: s_endpgm
@@ -374,28 +327,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
-; GFX9-GISEL-NEXT: s_mov_b32 s11, 0
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
-; GFX9-GISEL-NEXT: s_mov_b32 s4, s11
-; GFX9-GISEL-NEXT: s_mov_b32 s6, s11
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s1
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s1
; GFX9-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -436,28 +380,19 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL: ; %bb.0: ; %entry
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10
-; GFX942-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s10, s7
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s8, s1
; GFX942-GISEL-NEXT: s_mov_b32 s9, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-GISEL-NEXT: s_mov_b32 s10, s3
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
-; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
-; GFX942-GISEL-NEXT: s_mov_b32 s8, s7
+; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_mov_b32 s6, s1
+; GFX942-GISEL-NEXT: s_mov_b32 s4, s1
; GFX942-GISEL-NEXT: s_mov_b32 s5, s2
-; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -501,30 +436,23 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10
-; GFX10-GISEL-NEXT: s_mov_b32 s7, 0
-; GFX10-GISEL-NEXT: s_mov_b32 s10, s7
-; GFX10-GISEL-NEXT: s_mov_b32 s4, s7
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s11, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
-; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30
+; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_mov_b32 s6, s1
-; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX10-GISEL-NEXT: s_mov_b32 s5, s2
; GFX10-GISEL-NEXT: s_mov_b32 s6, s3
-; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-GISEL-NEXT: s_endpgm
@@ -569,33 +497,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX11-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX11-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX11-GISEL-NEXT: s_mov_b32 s4, s9
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
@@ -640,33 +557,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10
-; GFX12-GISEL-NEXT: s_mov_b32 s9, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_mov_b32 s10, s9
-; GFX12-GISEL-NEXT: s_mov_b32 s6, s9
+; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s11, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
-; GFX12-GISEL-NEXT: s_mov_b32 s4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX12-GISEL-NEXT: s_mov_b32 s8, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
+; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index d578d2e..60570bd 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1296,4 +1296,4 @@ declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
-attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+attributes #0 = { nounwind "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index c1cf06e..fba42c4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,9 +388,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit killed $scc
- ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+ ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -417,6 +416,80 @@ body: |
S_ENDPGM 0
...
+---
+name: xor_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+name: absdiff_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
---
name: and_1_cmp_eq_1_clobbered_scc
@@ -2070,8 +2143,7 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
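The optimize-compare.mir updates all exercise the same peephole (presumably SIInstrInfo::optimizeCompareInstr, reached from the generic peephole pass): when the instruction that defines the compared register already sets SCC with an equivalent predicate, the s_cmp is deleted (or, as in the first hunk, merged into a single S_BITCMP1_B32), and any intervening "killed" flag on SCC is cleared because SCC now stays live through to the branch. The two new tests add S_XOR_B32 and S_ABSDIFF_I32 sources alongside the existing S_AND_B32 one; the essential before/after, taken from the AND hunk below:

; before: the AND's SCC def was dead and a separate compare recomputed it
%1:sreg_32 = S_AND_B32 3, killed %0, implicit-def dead $scc
S_CMP_LG_U32 killed %1, 0, implicit-def $scc
; after: the compare folds into the AND's own SCC def
%1:sreg_32 = S_AND_B32 3, killed %0, implicit-def $scc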
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index f53aaaa..dd5f838 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
@@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
%zext = zext i1 %cmp to i32
ret i32 %zext
}
+
+
+; --------------------------------------------------------------------------------
+; Negative tests
+; --------------------------------------------------------------------------------
+
+@1 = extern_weak dso_local addrspace(4) constant i32
+
+define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
+; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: ; %bb.1: ; %endif
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_3:
+ %cmp = icmp ne ptr addrspace(4) @1, null
+ br i1 %cmp, label %endif, label %if
+
+if:
+ ret i32 0
+
+endif:
+ ret i32 1
+}
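Every positive test in s_cmp_0.ll loses the same instruction: the scalar ALU op (shift, logic op, abs, bfe, bcnt, quadmask, not) defines SCC itself, so the trailing s_cmp_lg_u32/s_cmp_lg_u64 against zero folds away. The IR shape behind all of them, as a minimal sketch (hypothetical function name; the real tests end in the same icmp-ne-zero plus zext tail visible at the end of not64 above):

define amdgpu_ps i32 @shl32_sketch(i32 inreg %val0, i32 inreg %val1) {
  %result = shl i32 %val0, %val1
  %cmp = icmp ne i32 %result, 0
  %zext = zext i1 %cmp to i32
  ret i32 %zext
}

The new negative test pins down the guard: the s_addc_u32 that completes the PC-relative address sets SCC to its carry-out, not to a result-is-nonzero test (and an extern_weak address may genuinely be null), so here the s_cmp_lg_u64 has to stay.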
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index a828ee0..7552f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
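In s_uaddo_usubo_pseudo.ll the carry-out of s_add_u32/s_sub_u32 now feeds s_addc_u32/s_subb_u32 directly through SCC; the old round trip that materialized the carry into an SGPR pair with s_cselect_b64 and then compared the pair back against zero is gone. A minimal sketch of the overflow-intrinsic input (hypothetical name; the call matches the first test, while the exact use of the carry bit afterwards is illustrative):

define amdgpu_ps i32 @uaddo_sketch(i32 inreg %val0) {
  %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
  %carry = extractvalue { i32, i1 } %pair, 1
  %ext = zext i1 %carry to i32
  %sum = add i32 %ext, 1
  ret i32 %sum
}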
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 5f6d622..71f5a94 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s14, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: s_add_u32 s16, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s15, 0, s4
+; GCN-NEXT: s_addc_u32 s17, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s15
+; GCN-NEXT: s_mul_i32 s4, s10, s17
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s14
-; GCN-NEXT: s_add_i32 s16, s4, s5
-; GCN-NEXT: s_sub_i32 s17, s7, s16
-; GCN-NEXT: s_mul_i32 s4, s10, s14
+; GCN-NEXT: s_mul_i32 s5, s11, s16
+; GCN-NEXT: s_add_i32 s18, s4, s5
+; GCN-NEXT: s_sub_i32 s14, s7, s18
+; GCN-NEXT: s_mul_i32 s4, s10, s16
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s18, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s17, s17, s11
-; GCN-NEXT: s_sub_u32 s19, s6, s10
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s15, s4, s5
+; GCN-NEXT: s_subb_u32 s19, s14, s11
+; GCN-NEXT: s_sub_u32 s20, s6, s10
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_or_b32 s14, s14, s15
+; GCN-NEXT: s_subb_u32 s14, s19, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s14, s19, s15
+; GCN-NEXT: s_add_u32 s15, s16, 1
+; GCN-NEXT: s_addc_u32 s19, s17, 0
+; GCN-NEXT: s_add_u32 s20, s16, 2
+; GCN-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s14, s20, s15
+; GCN-NEXT: s_cselect_b32 s15, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s17, 0
+; GCN-NEXT: s_subb_u32 s4, s7, s18
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s19, s10
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s17, s5
-; GCN-NEXT: s_add_u32 s5, s14, 1
-; GCN-NEXT: s_addc_u32 s17, s15, 0
-; GCN-NEXT: s_add_u32 s19, s14, 2
-; GCN-NEXT: s_addc_u32 s20, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s19, s5
-; GCN-NEXT: s_cselect_b32 s5, s20, s17
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s16
-; GCN-NEXT: s_cmp_ge_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s6, s6, s16
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s5, s5, s15
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_eq_u32 s4, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_cselect_b32 s5, s15, s17
+; GCN-NEXT: s_cselect_b32 s4, s14, s16
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mul_i32 s8, s7, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s11, s9, s8
-; GCN-NEXT: s_sub_i32 s12, 0, s11
-; GCN-NEXT: s_mul_i32 s8, s6, s10
-; GCN-NEXT: s_sub_u32 s13, 24, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s12, s12, s7
-; GCN-NEXT: s_sub_u32 s15, s13, s6
+; GCN-NEXT: s_add_i32 s13, s9, s8
+; GCN-NEXT: s_sub_i32 s10, 0, s13
+; GCN-NEXT: s_mul_i32 s8, s6, s12
+; GCN-NEXT: s_sub_u32 s14, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s11, s8, s9
+; GCN-NEXT: s_subb_u32 s15, s10, s7
+; GCN-NEXT: s_sub_u32 s16, s14, s6
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s10, s15, s11
+; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_addc_u32 s17, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s16, s11
+; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s13
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s6
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s8, s12, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s12, 0, 0
-; GCN-NEXT: s_add_u32 s15, s10, 2
-; GCN-NEXT: s_addc_u32 s16, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s15, s9
-; GCN-NEXT: s_cselect_b32 s9, s16, s12
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s11, 0, s11
-; GCN-NEXT: s_cmp_ge_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s12
+; GCN-NEXT: s_cmp_eq_u32 s8, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s9, 0
-; GCN-NEXT: s_cselect_b32 s6, s8, s10
+; GCN-NEXT: s_cselect_b32 s7, s11, 0
+; GCN-NEXT: s_cselect_b32 s6, s10, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
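The large sdiv64.ll diff (and the srem.ll one below) is the same carry-chain cleanup inside the 64-bit division expansion: wherever a carry was spilled with s_cselect_b64 (optionally folded through s_or_b32) and then re-tested with s_cmp_lg_u32/s_cmp_lg_u64 before an s_addc_u32 or s_subb_u32, the re-test is deleted and SCC flows straight from the preceding instruction; the removed compares free registers and reorder the surrounding code, which accounts for most of the churn. The recurring shape, lifted from the first hunk above:

; before
s_cselect_b64 s[0:1], -1, 0
s_or_b32 s0, s0, s1
s_cmp_lg_u32 s0, 0
s_addc_u32 s14, s14, s15
; after: s_or_b32 already set SCC to (result != 0)
s_cselect_b64 s[0:1], -1, 0
s_or_b32 s0, s0, s1
s_addc_u32 s14, s14, s15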
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index bbd1793..e12e31b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s12, 0, s9
+; GCN-NEXT: s_subb_u32 s10, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s11, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
-; GCN-NEXT: s_mul_i32 s14, s12, s10
-; GCN-NEXT: s_add_i32 s11, s15, s11
-; GCN-NEXT: s_add_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s10
-; GCN-NEXT: s_mul_i32 s15, s10, s11
-; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
+; GCN-NEXT: s_mul_i32 s14, s10, s12
+; GCN-NEXT: s_add_i32 s13, s15, s13
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s12
+; GCN-NEXT: s_mul_i32 s15, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
-; GCN-NEXT: s_mul_i32 s16, s13, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
+; GCN-NEXT: s_mul_i32 s16, s11, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s11, s13, s11
-; GCN-NEXT: s_add_u32 s11, s14, s11
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s15, s10, s11
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s10, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
-; GCN-NEXT: s_add_i32 s10, s11, s10
-; GCN-NEXT: s_mul_i32 s12, s12, s15
-; GCN-NEXT: s_add_i32 s10, s10, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
-; GCN-NEXT: s_mul_i32 s14, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s15, s10
-; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
+; GCN-NEXT: s_add_i32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s10, s10, s12
+; GCN-NEXT: s_add_i32 s13, s13, s10
+; GCN-NEXT: s_mul_i32 s3, s3, s12
+; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
+; GCN-NEXT: s_mul_i32 s15, s11, s3
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
-; GCN-NEXT: s_addc_u32 s3, s16, s12
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s10, s13, s10
-; GCN-NEXT: s_add_u32 s3, s3, s10
-; GCN-NEXT: s_addc_u32 s12, 0, s11
-; GCN-NEXT: s_add_u32 s3, s15, s3
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s14, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
+; GCN-NEXT: s_addc_u32 s3, s16, s14
+; GCN-NEXT: s_addc_u32 s10, s10, 0
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s3, s3, s13
+; GCN-NEXT: s_addc_u32 s10, 0, s10
+; GCN-NEXT: s_add_u32 s3, s12, s3
+; GCN-NEXT: s_addc_u32 s14, s11, s10
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s21, s18, s8
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_sub_u32 s16, s18, s8
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s21, s18
+; GCN-NEXT: s_cselect_b32 s16, s16, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s16, 0, s7
+; GCN-NEXT: s_subb_u32 s14, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
-; GCN-NEXT: s_mul_i32 s18, s16, s14
-; GCN-NEXT: s_add_i32 s15, s19, s15
-; GCN-NEXT: s_add_i32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s14
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
+; GCN-NEXT: s_mul_i32 s18, s14, s16
+; GCN-NEXT: s_add_i32 s17, s19, s17
+; GCN-NEXT: s_add_i32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s16
+; GCN-NEXT: s_mul_i32 s19, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
-; GCN-NEXT: s_mul_i32 s20, s17, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
+; GCN-NEXT: s_mul_i32 s20, s15, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s15, s17, s15
-; GCN-NEXT: s_add_u32 s15, s18, s15
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s17, s18, s17
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s19, s14, s15
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s14, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
-; GCN-NEXT: s_add_i32 s14, s15, s14
-; GCN-NEXT: s_mul_i32 s16, s16, s19
-; GCN-NEXT: s_add_i32 s14, s14, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
-; GCN-NEXT: s_mul_i32 s18, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s19, s14
-; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
+; GCN-NEXT: s_add_u32 s16, s16, s17
+; GCN-NEXT: s_addc_u32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
+; GCN-NEXT: s_add_i32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s14, s14, s16
+; GCN-NEXT: s_add_i32 s17, s17, s14
+; GCN-NEXT: s_mul_i32 s9, s9, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
+; GCN-NEXT: s_mul_i32 s19, s15, s9
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
-; GCN-NEXT: s_addc_u32 s9, s20, s16
-; GCN-NEXT: s_addc_u32 s15, s15, 0
-; GCN-NEXT: s_mul_i32 s14, s17, s14
-; GCN-NEXT: s_add_u32 s9, s9, s14
-; GCN-NEXT: s_addc_u32 s16, 0, s15
-; GCN-NEXT: s_add_u32 s9, s19, s9
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s18, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
+; GCN-NEXT: s_addc_u32 s9, s20, s18
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s9, s9, s17
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_add_u32 s9, s16, s9
+; GCN-NEXT: s_addc_u32 s18, s15, s14
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s25, s22, s6
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_sub_u32 s20, s22, s6
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s25, s22
+; GCN-NEXT: s_cselect_b32 s20, s20, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s24, 0, s7
+; GCN-NEXT: s_subb_u32 s22, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s25, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
-; GCN-NEXT: s_mul_i32 s26, s24, s22
-; GCN-NEXT: s_add_i32 s23, s27, s23
-; GCN-NEXT: s_add_i32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s22
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s24, v0
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
+; GCN-NEXT: s_mul_i32 s26, s22, s24
+; GCN-NEXT: s_add_i32 s25, s27, s25
+; GCN-NEXT: s_add_i32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s24
+; GCN-NEXT: s_mul_i32 s27, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
-; GCN-NEXT: s_mul_i32 s28, s25, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
+; GCN-NEXT: s_mul_i32 s28, s23, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s23, s25, s23
-; GCN-NEXT: s_add_u32 s23, s26, s23
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s25, s26, s25
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s27, s22, s23
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s22, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
-; GCN-NEXT: s_add_i32 s22, s23, s22
-; GCN-NEXT: s_mul_i32 s24, s24, s27
-; GCN-NEXT: s_add_i32 s22, s22, s24
-; GCN-NEXT: s_mul_i32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
-; GCN-NEXT: s_mul_i32 s26, s25, s17
-; GCN-NEXT: s_mul_i32 s29, s27, s22
-; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
+; GCN-NEXT: s_add_u32 s24, s24, s25
+; GCN-NEXT: s_addc_u32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
+; GCN-NEXT: s_add_i32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s22, s22, s24
+; GCN-NEXT: s_add_i32 s25, s25, s22
+; GCN-NEXT: s_mul_i32 s17, s17, s24
+; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
+; GCN-NEXT: s_mul_i32 s27, s23, s17
+; GCN-NEXT: s_mul_i32 s29, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s26
-; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
-; GCN-NEXT: s_addc_u32 s17, s28, s24
-; GCN-NEXT: s_addc_u32 s23, s23, 0
-; GCN-NEXT: s_mul_i32 s22, s25, s22
-; GCN-NEXT: s_add_u32 s17, s17, s22
-; GCN-NEXT: s_addc_u32 s24, 0, s23
-; GCN-NEXT: s_add_u32 s17, s27, s17
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s26, s25, s24
+; GCN-NEXT: s_add_u32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
+; GCN-NEXT: s_addc_u32 s17, s28, s26
+; GCN-NEXT: s_addc_u32 s22, s22, 0
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s17, s17, s25
+; GCN-NEXT: s_addc_u32 s22, 0, s22
+; GCN-NEXT: s_add_u32 s17, s24, s17
+; GCN-NEXT: s_addc_u32 s26, s23, s22
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s34, s30, s6
-; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
+; GCN-NEXT: s_sub_u32 s28, s30, s6
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s34, s30
+; GCN-NEXT: s_cselect_b32 s28, s28, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s22, 0, s19
+; GCN-NEXT: s_subb_u32 s20, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s20, v0
-; GCN-NEXT: s_mul_i32 s21, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
-; GCN-NEXT: s_mul_i32 s24, s22, s20
-; GCN-NEXT: s_add_i32 s21, s25, s21
-; GCN-NEXT: s_add_i32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s20
-; GCN-NEXT: s_mul_i32 s25, s20, s21
-; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
+; GCN-NEXT: v_readfirstlane_b32 s21, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
+; GCN-NEXT: s_mul_i32 s24, s20, s22
+; GCN-NEXT: s_add_i32 s23, s25, s23
+; GCN-NEXT: s_add_i32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s22
+; GCN-NEXT: s_mul_i32 s25, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
-; GCN-NEXT: s_mul_i32 s26, s23, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
+; GCN-NEXT: s_mul_i32 s26, s21, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s21, s23, s21
-; GCN-NEXT: s_add_u32 s21, s24, s21
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s23, s24, s23
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s25, s20, s21
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s20, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
-; GCN-NEXT: s_add_i32 s20, s21, s20
-; GCN-NEXT: s_mul_i32 s22, s22, s25
-; GCN-NEXT: s_add_i32 s20, s20, s22
-; GCN-NEXT: s_mul_i32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
-; GCN-NEXT: s_mul_i32 s24, s23, s13
-; GCN-NEXT: s_mul_i32 s27, s25, s20
-; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
+; GCN-NEXT: s_add_u32 s22, s22, s23
+; GCN-NEXT: s_addc_u32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
+; GCN-NEXT: s_add_i32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s20, s20, s22
+; GCN-NEXT: s_add_i32 s23, s23, s20
+; GCN-NEXT: s_mul_i32 s13, s13, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
+; GCN-NEXT: s_mul_i32 s25, s21, s13
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s24
-; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
-; GCN-NEXT: s_addc_u32 s13, s26, s22
-; GCN-NEXT: s_addc_u32 s21, s21, 0
-; GCN-NEXT: s_mul_i32 s20, s23, s20
-; GCN-NEXT: s_add_u32 s13, s13, s20
-; GCN-NEXT: s_addc_u32 s22, 0, s21
-; GCN-NEXT: s_add_u32 s13, s25, s13
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s24, s23, s22
+; GCN-NEXT: s_add_u32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
+; GCN-NEXT: s_addc_u32 s13, s26, s24
+; GCN-NEXT: s_addc_u32 s20, s20, 0
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s13, s13, s23
+; GCN-NEXT: s_addc_u32 s20, 0, s20
+; GCN-NEXT: s_add_u32 s13, s22, s13
+; GCN-NEXT: s_addc_u32 s24, s21, s20
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s31, s28, s18
-; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
+; GCN-NEXT: s_sub_u32 s26, s28, s18
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s31, s28
+; GCN-NEXT: s_cselect_b32 s26, s26, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s18, 0, s15
+; GCN-NEXT: s_subb_u32 s16, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s19, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
-; GCN-NEXT: s_mul_i32 s20, s18, s16
-; GCN-NEXT: s_add_i32 s17, s21, s17
-; GCN-NEXT: s_add_i32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s16
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s18, v0
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
+; GCN-NEXT: s_mul_i32 s20, s16, s18
+; GCN-NEXT: s_add_i32 s19, s21, s19
+; GCN-NEXT: s_add_i32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s18
+; GCN-NEXT: s_mul_i32 s21, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
-; GCN-NEXT: s_mul_i32 s22, s19, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
+; GCN-NEXT: s_mul_i32 s22, s17, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s17, s19, s17
-; GCN-NEXT: s_add_u32 s17, s20, s17
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s19, s20, s19
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s21, s16, s17
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s16, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
-; GCN-NEXT: s_add_i32 s16, s17, s16
-; GCN-NEXT: s_mul_i32 s18, s18, s21
-; GCN-NEXT: s_add_i32 s16, s16, s18
-; GCN-NEXT: s_mul_i32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
-; GCN-NEXT: s_mul_i32 s20, s19, s9
-; GCN-NEXT: s_mul_i32 s23, s21, s16
-; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
+; GCN-NEXT: s_add_u32 s18, s18, s19
+; GCN-NEXT: s_addc_u32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
+; GCN-NEXT: s_add_i32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s16, s16, s18
+; GCN-NEXT: s_add_i32 s19, s19, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s17, s9
+; GCN-NEXT: s_mul_i32 s23, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s20
-; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
-; GCN-NEXT: s_addc_u32 s9, s22, s18
-; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mul_i32 s16, s19, s16
-; GCN-NEXT: s_add_u32 s9, s9, s16
-; GCN-NEXT: s_addc_u32 s18, 0, s17
-; GCN-NEXT: s_add_u32 s9, s21, s9
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s20, s19, s18
+; GCN-NEXT: s_add_u32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
+; GCN-NEXT: s_addc_u32 s9, s22, s20
+; GCN-NEXT: s_addc_u32 s16, s16, 0
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: s_add_u32 s9, s18, s9
+; GCN-NEXT: s_addc_u32 s20, s17, s16
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s27, s24, s14
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_sub_u32 s22, s24, s14
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s27, s24
+; GCN-NEXT: s_cselect_b32 s22, s22, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
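; Annotation (not part of the patch): the hunks above and below all shed
; the same redundancy: the s_cmp_lg_u32/s_cmp_lg_u64 that re-tested SCC
; between an s_cselect_b64 and the dependent s_addc_u32/s_subb_u32 is
; dropped, with the carry consumed directly. A minimal sketch of IR that
; produces such a carry chain, mirroring the s_test_srem signature in the
; next file; the body is assumed for illustration, not taken from this
; patch:

define amdgpu_kernel void @srem_sketch(ptr addrspace(1) %out, i64 %x, i64 %y) {
  ; 64-bit srem is expanded by the backend into the long scalar
  ; add/addc and sub/subb sequences checked in these tests.
  %r = srem i64 %x, %y
  store i64 %r, ptr addrspace(1) %out
  ret void
}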
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33b0a5d..ea9bb04 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s12, s11, s12
-; GCN-NEXT: s_sub_i32 s13, s7, s12
+; GCN-NEXT: s_add_i32 s14, s11, s12
+; GCN-NEXT: s_sub_i32 s12, s7, s14
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s14, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s15, s6, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s16, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s4
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s17, s17, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s18, s15, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s13, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s6, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s17, s12, s13
+; GCN-NEXT: s_subb_u32 s17, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s4
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, s19, s18
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s15, s15, s5
+; GCN-NEXT: s_sub_u32 s19, s16, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s17, 0
-; GCN-NEXT: s_cselect_b32 s11, s18, s15
-; GCN-NEXT: s_cselect_b32 s10, s10, s16
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s12
+; GCN-NEXT: s_subb_u32 s7, s7, s14
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s12
+; GCN-NEXT: s_cselect_b32 s4, s4, s10
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s10, s7
-; GCN-NEXT: s_cselect_b32 s4, s11, s6
+; GCN-NEXT: s_cselect_b32 s5, s12, s7
+; GCN-NEXT: s_cselect_b32 s4, s13, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s8, s8, s7
-; GCN-NEXT: s_sub_i32 s9, 0, s8
-; GCN-NEXT: s_sub_u32 s10, 24, s6
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s11, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s12, s10, s4
+; GCN-NEXT: s_add_i32 s10, s8, s7
+; GCN-NEXT: s_sub_i32 s8, 0, s10
+; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s9, s6, s7
+; GCN-NEXT: s_subb_u32 s12, s8, s5
+; GCN-NEXT: s_sub_u32 s13, s11, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s4
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s13, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s13, s9, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s5
+; GCN-NEXT: s_subb_u32 s6, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s6, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s4
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s5
-; GCN-NEXT: s_cselect_b32 s14, s14, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s15, s12, s4
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s6, s9, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s7, s15, s12
-; GCN-NEXT: s_cselect_b32 s6, s6, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s8, 0, s8
-; GCN-NEXT: s_cmp_ge_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s4
+; GCN-NEXT: s_cmp_ge_u32 s11, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s9
+; GCN-NEXT: s_cmp_eq_u32 s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s6, s8
-; GCN-NEXT: s_cselect_b32 s5, s7, s10
+; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/stackguard.ll b/llvm/test/CodeGen/AMDGPU/stackguard.ll
new file mode 100644
index 0000000..393686f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stackguard.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; FIXME: To actually support stackguard, the intrinsic needs to be fixed
+; to return a pointer in any address space.
+
+; CHECK: error: unable to lower stackguard
+define i1 @test_stackguard(ptr %p1) {
+ %p2 = call ptr @llvm.stackguard()
+ %res = icmp ne ptr %p2, %p1
+ ret i1 %res
+}
+
+declare ptr @llvm.stackguard()
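; Annotation (not part of the patch): the FIXME above names the blocker;
; llvm.stackguard currently returns a plain ptr in address space 0. A
; hypothetical sketch of the test once the intrinsic is overloaded on the
; result's address space, using addrspace(5), the AMDGPU private (stack)
; space; the .p5 suffix and signature are assumed for illustration and do
; not exist in LLVM today:

define i1 @test_stackguard_p5(ptr addrspace(5) %p1) {
  ; NOTE: @llvm.stackguard.p5 is hypothetical; today's verifier would
  ; reject it because only the unqualified @llvm.stackguard exists.
  %p2 = call ptr addrspace(5) @llvm.stackguard.p5()
  %res = icmp ne ptr addrspace(5) %p2, %p1
  ret i1 %res
}

declare ptr addrspace(5) @llvm.stackguard.p5()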
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_add_u32 s4, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
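; Annotation (not part of the patch): the s_uaddo_i64 checks above pair
; s_add_u32/s_addc_u32 and read the carry back through s_cselect plus
; v_cndmask. A minimal sketch of the IR shape behind them, built on the
; uadd.with.overflow intrinsic; the %carryout destination is assumed for
; illustration:

declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

define amdgpu_kernel void @uaddo_sketch(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) {
  ; %carryout is illustrative; the add result and the i1 overflow bit
  ; are what the s_addc_u32 / v_cndmask pairs above materialize.
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %pair, 0
  %ovf = extractvalue { i64, i1 } %pair, 1
  store i64 %val, ptr addrspace(1) %out
  store i1 %ovf, ptr addrspace(1) %carryout
  ret void
}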
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 41199b0..fd461ac 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_addc_u32 s10, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s8
+; GCN-NEXT: s_mul_i32 s0, s3, s10
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
-; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s12, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_add_i32 s11, s1, s0
+; GCN-NEXT: s_sub_i32 s8, 0, s11
+; GCN-NEXT: s_mul_i32 s0, s2, s10
+; GCN-NEXT: s_sub_u32 s12, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s9, s0, s1
+; GCN-NEXT: s_subb_u32 s13, s8, s3
+; GCN-NEXT: s_sub_u32 s14, s12, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_cselect_b32 s13, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s8, s13, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s13, 0, 0
+; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s14, s9
+; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_subb_u32 s0, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s10, s1
-; GCN-NEXT: s_add_u32 s1, s8, 1
-; GCN-NEXT: s_addc_u32 s10, 0, 0
-; GCN-NEXT: s_add_u32 s13, s8, 2
-; GCN-NEXT: s_addc_u32 s14, 0, 0
+; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s13, s1
-; GCN-NEXT: s_cselect_b32 s1, s14, s10
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s1, s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s8
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_cselect_b32 s0, s9, 0
+; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
-; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cdcc914..137dc1f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_add_i32 s10, s1, s0
+; GCN-NEXT: s_sub_i32 s9, 0, s10
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s8, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s11, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s12, s8, s2
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s8, s0, s1
+; GCN-NEXT: s_subb_u32 s12, s9, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s3
+; GCN-NEXT: s_sub_u32 s16, s13, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s13, s10, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s3
+; GCN-NEXT: s_subb_u32 s0, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s3
-; GCN-NEXT: s_cselect_b32 s14, s14, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s15, s12, s2
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s1, s15, s12
-; GCN-NEXT: s_cselect_b32 s0, s0, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s9
-; GCN-NEXT: s_cselect_b32 s1, s1, s8
+; GCN-NEXT: s_cmp_eq_u32 s0, s3
+; GCN-NEXT: s_cselect_b32 s1, s2, s1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cselect_b32 s1, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
-; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index d67a7b1..e8db647 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_subb_u32 s4, s3, s7
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 75db387..28c6b40 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
-; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1032-NEXT: s_mul_i32 s12, s9, s8
; GFX1032-NEXT: s_mul_i32 s9, s9, s5
-; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1032-NEXT: s_add_i32 s9, s13, s9
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_add_i32 s9, s11, s9
+; GFX1032-NEXT: s_mul_i32 s11, s5, s12
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_i32 s10, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s12, s12, s15
+; GFX1032-NEXT: s_add_u32 s10, s10, s15
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s12, s10
+; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s10, s11
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s11, 0
+; GFX1032-NEXT: s_addc_u32 s11, s12, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
-; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1032-NEXT: s_add_u32 s11, s11, s12
-; GFX1032-NEXT: s_addc_u32 s10, 0, s10
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_add_u32 s9, s9, s12
+; GFX1032-NEXT: s_addc_u32 s11, 0, s11
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s11, s8
+; GFX1032-NEXT: s_add_u32 s8, s9, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s10, s9
+; GFX1032-NEXT: s_addc_u32 s8, s11, s10
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
-; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s9, 0, s0
-; GFX1064-NEXT: s_subb_u32 s10, 0, s1
+; GFX1064-NEXT: s_sub_u32 s8, 0, s0
+; GFX1064-NEXT: s_subb_u32 s9, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: s_mul_i32 s5, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
-; GFX1064-NEXT: s_mul_i32 s11, s10, s4
-; GFX1064-NEXT: s_add_i32 s5, s12, s5
-; GFX1064-NEXT: s_mul_i32 s13, s9, s4
-; GFX1064-NEXT: s_add_i32 s5, s5, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX1064-NEXT: s_mul_i32 s15, s4, s5
-; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT: s_mul_i32 s11, s8, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT: s_mul_i32 s10, s8, s4
+; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s9, s5
+; GFX1064-NEXT: s_add_i32 s10, s12, s10
+; GFX1064-NEXT: s_mul_i32 s13, s8, s5
+; GFX1064-NEXT: s_add_i32 s10, s10, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
+; GFX1064-NEXT: s_mul_i32 s15, s5, s10
+; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
+; GFX1064-NEXT: s_mul_i32 s11, s4, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
+; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s10, s4, s10
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s5, s11, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s12, s4, s5
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_i32 s4, s9, s12
-; GFX1064-NEXT: s_addc_u32 s8, s8, s11
-; GFX1064-NEXT: s_mul_i32 s10, s10, s12
-; GFX1064-NEXT: s_mul_i32 s9, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX1064-NEXT: s_add_i32 s9, s13, s9
-; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX1064-NEXT: s_add_i32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s4, s8, s4
-; GFX1064-NEXT: s_mul_i32 s14, s12, s9
-; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1064-NEXT: s_add_u32 s5, s5, s14
+; GFX1064-NEXT: s_add_u32 s5, s5, s10
+; GFX1064-NEXT: s_addc_u32 s4, s4, s11
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s8, s5
+; GFX1064-NEXT: s_mul_i32 s8, s8, s4
+; GFX1064-NEXT: s_mul_i32 s9, s9, s5
+; GFX1064-NEXT: s_add_i32 s8, s10, s8
+; GFX1064-NEXT: s_mul_i32 s10, s4, s11
+; GFX1064-NEXT: s_add_i32 s8, s8, s9
+; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
+; GFX1064-NEXT: s_mul_i32 s14, s5, s8
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s14
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1064-NEXT: s_add_u32 s4, s5, s4
-; GFX1064-NEXT: s_mul_i32 s9, s8, s9
-; GFX1064-NEXT: s_addc_u32 s4, s13, s11
-; GFX1064-NEXT: s_addc_u32 s5, s10, 0
-; GFX1064-NEXT: s_add_u32 s4, s4, s9
-; GFX1064-NEXT: s_addc_u32 s9, 0, s5
-; GFX1064-NEXT: s_add_u32 s10, s12, s4
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
-; GFX1064-NEXT: s_addc_u32 s5, s8, s9
-; GFX1064-NEXT: s_mul_i32 s8, s3, s10
-; GFX1064-NEXT: s_mul_i32 s10, s2, s5
-; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
-; GFX1064-NEXT: s_addc_u32 s9, 0, s9
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s10, s8
+; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s8, s4, s8
+; GFX1064-NEXT: s_addc_u32 s9, s13, s12
+; GFX1064-NEXT: s_addc_u32 s10, s11, 0
+; GFX1064-NEXT: s_add_u32 s8, s9, s8
+; GFX1064-NEXT: s_addc_u32 s9, 0, s10
+; GFX1064-NEXT: s_add_u32 s5, s5, s8
+; GFX1064-NEXT: s_addc_u32 s4, s4, s9
+; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
+; GFX1064-NEXT: s_mul_i32 s11, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_addc_u32 s4, s9, s4
+; GFX1064-NEXT: s_add_u32 s8, s8, s11
+; GFX1064-NEXT: s_addc_u32 s10, 0, s10
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
+; GFX1064-NEXT: s_add_u32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s4, s3, s4
+; GFX1064-NEXT: s_addc_u32 s5, s10, s9
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s4, s5
+; GFX1064-NEXT: s_add_u32 s10, s5, s4
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s12, s4, s8
+; GFX1064-NEXT: s_add_i32 s8, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s8, s3, s12
-; GFX1064-NEXT: s_sub_u32 s13, s2, s4
+; GFX1064-NEXT: s_sub_i32 s9, s3, s8
+; GFX1064-NEXT: s_sub_u32 s12, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s14, s8, s1
-; GFX1064-NEXT: s_sub_u32 s15, s13, s0
-; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1064-NEXT: s_subb_u32 s8, s14, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, s1
+; GFX1064-NEXT: s_sub_u32 s13, s12, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1064-NEXT: s_add_u32 s9, s10, 1
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
+; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1064-NEXT: s_add_u32 s13, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s12
+; GFX1064-NEXT: s_subb_u32 s3, s3, s8
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 64d055b..4445383 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll b/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
index 96639ed..bfaf799 100644
--- a/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
+++ b/llvm/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
@@ -45,6 +45,6 @@ declare ptr @__cxa_begin_catch(ptr)
declare void @__cxa_end_catch()
-attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="true" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir b/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
index 812ac23..a18023c 100644
--- a/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
+++ b/llvm/test/CodeGen/ARM/ARMLoadStoreDBG.mir
@@ -31,8 +31,8 @@
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
- attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+ attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 25119fe..7df230c 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -441,8 +441,7 @@ entry:
define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0, 0)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ARG]](s32), [[ARG]](s32)
; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
@@ -453,8 +452,7 @@ define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
define i32 @test_shufflevector_s32_s32_s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_s32_s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(s32) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0)
+; CHECK: r0 = COPY [[ARG]](s32)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <1 x i32> zeroinitializer
%res = extractelement <1 x i32> %shuffle, i32 0
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 9601a2e..2731148 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -166,6 +166,7 @@
; CHECK-NEXT: ARM Execution Domain Fix
; CHECK-NEXT: BreakFalseDeps
; CHECK-NEXT: ARM pseudo instruction expansion pass
+; CHECK-NEXT: Insert KCFI indirect call checks
; CHECK-NEXT: Thumb2 instruction size reduce pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
diff --git a/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll b/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
index d0bdd66..e4dc92d 100644
--- a/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
+++ b/llvm/test/CodeGen/ARM/Windows/wineh-basic.ll
@@ -36,8 +36,8 @@ declare arm_aapcs_vfpcc i32 @__CxxFrameHandler3(...)
declare arm_aapcs_vfpcc void @__std_terminate() local_unnamed_addr
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "use-soft-float"="false" }
attributes #2 = { noreturn nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/ARM/byval_load_align.ll b/llvm/test/CodeGen/ARM/byval_load_align.ll
index c594bd3..5bb4fe7 100644
--- a/llvm/test/CodeGen/ARM/byval_load_align.ll
+++ b/llvm/test/CodeGen/ARM/byval_load_align.ll
@@ -22,6 +22,6 @@ entry:
declare void @Logger(i8 signext, ptr byval(%struct.ModuleID)) #1
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index 972a470..cabd43e 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .llvm.callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index ec8d5b8..3d3974e 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .llvm.callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/ARM/cfguard-module-flag.ll b/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
index 3e8c9f4..bb3c04a 100644
--- a/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
+++ b/llvm/test/CodeGen/ARM/cfguard-module-flag.ll
@@ -21,7 +21,7 @@ entry:
; CHECK-NOT: __guard_check_icall_fptr
; CHECK-NOT: __guard_dispatch_icall_fptr
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+armv7-a,+dsp,+fp16,+neon,+strict-align,+thumb-mode,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false"}
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+armv7-a,+dsp,+fp16,+neon,+strict-align,+thumb-mode,+vfp3" "use-soft-float"="false"}
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"cfguard", i32 1}
diff --git a/llvm/test/CodeGen/ARM/clang-section.ll b/llvm/test/CodeGen/ARM/clang-section.ll
index 9277d90..9c32ab2 100644
--- a/llvm/test/CodeGen/ARM/clang-section.ll
+++ b/llvm/test/CodeGen/ARM/clang-section.ll
@@ -35,8 +35,8 @@ attributes #0 = { "bss-section"="my_bss.1" "data-section"="my_data.1" "rodata-se
attributes #1 = { "data-section"="my_data.1" "rodata-section"="my_rodata.1" }
attributes #2 = { "bss-section"="my_bss.2" "rodata-section"="my_rodata.1" }
attributes #3 = { "bss-section"="my_bss.2" "data-section"="my_data.2" "rodata-section"="my_rodata.2" }
-attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "use-soft-float"="false" }
+attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+vfp3" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2, !3}
diff --git a/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir b/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
index 47f4e1a..ae36da4 100644
--- a/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
+++ b/llvm/test/CodeGen/ARM/cmse-clear-float-bigend.mir
@@ -16,7 +16,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #1
- attributes #0 = { "cmse_nonsecure_entry" nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+8msecext,+armv8-m.main,-d32,-fp64,+fp-armv8,+hwdiv,+thumb-mode,-crypto,-fullfp16,-neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "cmse_nonsecure_entry" nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+8msecext,+armv8-m.main,-d32,-fp64,+fp-armv8,+hwdiv,+thumb-mode,-crypto,-fullfp16,-neon" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { "cmse_nonsecure_call" nounwind }
diff --git a/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll b/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 4d4853c..7960b79 100644
--- a/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/llvm/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -72,8 +72,8 @@ declare i32 @fn3(...) #1
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll b/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
index 246eeeb..4bc6c41 100644
--- a/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-promote-dbg.ll
@@ -19,7 +19,7 @@ entry:
ret ptr getelementptr inbounds ([4 x i8], ptr @.str, i32 0, i32 1), !dbg !16
}
-attributes #0 = { minsize norecurse nounwind optsize readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv,+soft-float,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { minsize norecurse nounwind optsize readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv,+soft-float,-crypto,-neon" "use-soft-float"="true" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
diff --git a/llvm/test/CodeGen/ARM/constantpool-promote.ll b/llvm/test/CodeGen/ARM/constantpool-promote.ll
index c383b39..87f14ebf 100644
--- a/llvm/test/CodeGen/ARM/constantpool-promote.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-promote.ll
@@ -200,8 +200,8 @@ declare void @d(ptr) #1
declare void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
declare void @llvm.memmove.p0.p0.i32(ptr, ptr, i32, i1) local_unnamed_addr
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/ARM/early-cfi-sections.ll b/llvm/test/CodeGen/ARM/early-cfi-sections.ll
index 72b8702..ef99ae5 100644
--- a/llvm/test/CodeGen/ARM/early-cfi-sections.ll
+++ b/llvm/test/CodeGen/ARM/early-cfi-sections.ll
@@ -13,7 +13,7 @@ entry:
ret void, !dbg !10
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "use-soft-float"="true" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
diff --git a/llvm/test/CodeGen/ARM/fp16-vld.ll b/llvm/test/CodeGen/ARM/fp16-vld.ll
index 549546e..778685c 100644
--- a/llvm/test/CodeGen/ARM/fp16-vld.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vld.ll
@@ -43,4 +43,4 @@ byeblock:
ret void
}
-attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/global-merge-1.ll b/llvm/test/CodeGen/ARM/global-merge-1.ll
index 46e9d96..05719ae 100644
--- a/llvm/test/CodeGen/ARM/global-merge-1.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-1.ll
@@ -74,9 +74,9 @@ define internal ptr @returnFoo() #2 {
ret ptr @foo
}
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll b/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
index 27534a6..bdd842a 100644
--- a/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
+++ b/llvm/test/CodeGen/ARM/isel-v8i32-crash.ll
@@ -21,4 +21,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/kcfi-arm.ll b/llvm/test/CodeGen/ARM/kcfi-arm.ll
new file mode 100644
index 0000000..e3696cf
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-arm.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: BLX %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: BLX killed $r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: TAILJMPr killed $r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
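+
+; The BUNDLE keeps KCFI_CHECK_ARM glued to the call so later passes cannot
+; reorder or insert code between the type check and the indirect branch it
+; guards.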
+
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp0
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r11, pc}
+
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
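+
+; A note on the check sequence above, reading the constants off the CHECK
+; lines: an ARM modified immediate encodes only an 8-bit value with an even
+; rotation, so the 32-bit type hash is xor-ed in one byte at a time:
+;   0x00BC614E (12345678) = 0x0000004E + 0x00006100 + 0x00BC0000 + 0x00000000
+; i.e. eor #78, eor #24832, eor #12320768, with the final eors #0 consuming
+; the zero top byte while also setting the flags for the beq.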
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp1
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument.
+; With 5+ arguments (target + 4 args), r0-r3 are all used for arguments,
+; forcing r3 to be spilled when we need it as a scratch register.
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: stmdb sp!, {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: ldm sp!, {r3}
+; ASM-NEXT: beq .Ltmp2
+; ASM-NEXT: udf #33772
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r11, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; The compiler shuffles arguments into place, saving r3 (c) in lr and loading d from the stack.
+; r3 is live as the 4th argument, so it is pushed before the KCFI check
+; and restored immediately after the comparison, before the branch.
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 not live as an argument; the target lands in r3, so r12 is used as scratch without spilling
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp3
+; ASM-NEXT: udf #33763
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r11, pc}
+; Only 3 arguments total, so r3 is not used as a call argument.
+; Compiler puts target→r3, a→r0, b→r1;
+; r3 is the target, so we use r12 as scratch (no spill needed).
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
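+
+; The trap immediates differ per call site: #33760 = 0x83E0, #33772 = 0x83EC,
+; #33763 = 0x83E3, so the low bits of the udf immediate appear to encode the
+; register holding the call target (r0, r12, r3), letting a trap handler
+; recover which target failed the check.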
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
new file mode 100644
index 0000000..8e71cae
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -filetype=obj < %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | FileCheck %s
+
+; This test verifies that KCFI instrumentation doesn't cause "out of range
+; pc-relative fixup value" errors when generating object files.
+;
+; The test creates a scenario with enough KCFI-instrumented indirect calls
+; (~32 bytes each) that would push a cbz/cbnz instruction out of its 126-byte
+; forward range if the KCFI_CHECK pseudo-instruction size is not properly
+; accounted for.
+;
+; Without the fix (KCFI_CHECK returns size 0):
+; - Backend thinks KCFI checks take no space
+; - Generates cbz to branch over the code
+; - During assembly, cbz target is >126 bytes away
+; - Assembly fails with "error: out of range pc-relative fixup value"
+;
+; With the fix (KCFI_CHECK returns size 32 for Thumb2):
+; - Backend correctly accounts for KCFI check expansion
+; - Avoids cbz or uses longer-range branch instructions
+; - Assembly succeeds, object file is generated
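+;
+; As a rough size check using the numbers above: six instrumented calls at
+; ~32 bytes per KCFI check add about 192 bytes between a cbz and its target,
+; well beyond the 126-byte forward reach of Thumb cbz/cbnz.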
+
+declare void @external_function(i32)
+
+; Test WITHOUT KCFI: should generate cbz since calls are small
+; CHECK-LABEL: test_without_kcfi:
+; CHECK: cbz
+; CHECK-NOT: bic{{.*}}#1
+define i32 @test_without_kcfi(ptr %callback, i32 %x) {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Regular (non-KCFI) indirect calls - much smaller
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+; Test WITH KCFI: should NOT generate cbz due to large KCFI checks
+; CHECK-LABEL: test_with_kcfi:
+; CHECK-NOT: cbz
+; CHECK: bic{{.*}}#1
+define i32 @test_with_kcfi(ptr %callback, i32 %x) !kcfi_type !1 {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Six KCFI-instrumented indirect calls (~192 bytes total, exceeds cbz range)
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+
+ ; Regular call to prevent optimization
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
new file mode 100644
index 0000000..f8e0838
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .p2align 2
+; CHECK-NOT: nop
+; CHECK: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-NOT: .long
+; CHECK-NOT: nop
+define void @f2(ptr noundef %x) {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK: .long 12345678
+; CHECK-COUNT-11: nop
+define void @f3(ptr noundef %x) #0 !kcfi_type !1 {
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-COUNT-11: nop
+define void @f4(ptr noundef %x) #0 {
+; CHECK-LABEL: f4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp5
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+attributes #0 = { "patchable-function-prefix"="11" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb.ll b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
new file mode 100644
index 0000000..7c02d830
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
+
+; This test verifies that Thumb1 (ARMv6-M) generates correct code for backend KCFI.
+; Thumb1 uses the backend KCFI implementation with Thumb1-specific instructions.
+
+; Test function without KCFI annotation
+; CHECK-LABEL: .globl nosan
+; CHECK-NEXT: .p2align 1
+; CHECK-NEXT: .type nosan,%function
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; CHECK-LABEL: nosan:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
+; CHECK-LABEL: .globl target_func
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .type target_func,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @target_func() !kcfi_type !1 {
+; CHECK-LABEL: target_func:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check using operand bundles
+; CHECK-LABEL: .globl f1
+; CHECK: .p2align 2
+; CHECK-NEXT: .type f1,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
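+
+; A note on the constant materialization above, taking the arithmetic from
+; the CHECK lines: Thumb1 has no flexible immediates, so the expected hash is
+; built a byte at a time:
+;   ((188 * 256 + 249) * 256 + 132) * 256 + 68 = 3170468932 = 0xBCF98444
+; which is the unsigned value of the i32 hash -1124498364.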
+
+; Test with tail call - backend KCFI supports tail calls
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ tail call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with R2 live (3 arguments) - compiler shuffles args and pushes r2 around the check
+define void @f3_r2_live(ptr noundef %x, i32 %a, i32 %b, i32 %c) !kcfi_type !1 {
+; CHECK-LABEL: f3_r2_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r3
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: beq .Ltmp2
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, pc}
+; Compiler shuffles: target→r4, c→r2, a→r0, b→r1
+; R2 is live (3rd arg), so we push it, then use R3 as a temp and R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with both R2 and R3 live (4 arguments) - compiler moves them to r5/r4, uses R3 as temp and R2 as scratch
+define void @f4_r2_r3_live(ptr noundef %x, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f4_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: ldr r3, [sp, #16]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+; Compiler shuffles: r3→r5, target→r4, d→r3 (from stack), a→r0, b→r1, c→r2
+; Then pushes r3 (d value), then r2, uses R3 as temp, R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test where target ends up in R12, forcing R2 as scratch, with both R2 and R3 live
+; This uses inline asm to force target into R12, with 4 call arguments to make R2/R3 live
+define void @f5_r12_target_r2_r3_live(i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f5_r12_target_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r12
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp4
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: blx r12
+; CHECK-NEXT: pop {r7, pc}
+; Use inline asm to get function pointer into R12
+; With 4 arguments (r0-r3), both R2 and R3 are live
+; Target in R12 means R2 is scratch, R3 is temp, and both need spilling
+ %target = call ptr asm "", "={r12}"()
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb2.ll b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
new file mode 100644
index 0000000..f319d98
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: tBLXr 14 /* CC::al */, $noreg, %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tBLXr 14 /* CC::al */, $noreg, {{(killed )?}}$r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tTAILJMPr {{(killed )?}}$r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
+
+; Test function without KCFI annotation
+; ASM-LABEL: .globl nosan
+; ASM-NEXT: .p2align 1
+; ASM-NEXT: .type nosan,%function
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; ASM-LABEL: nosan:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
+; ASM-LABEL: .globl target_func
+; ASM-NEXT: .p2align 2
+; ASM-NEXT: .type target_func,%function
+; ASM-NEXT: .long 12345678
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
+define void @target_func() !kcfi_type !1 {
+; ASM-LABEL: target_func:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp0
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r7, pc}
+
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp1
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument (Thumb2)
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: push {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: pop {r3}
+; ASM-NEXT: beq.w .Ltmp2
+; ASM-NEXT: udf #140
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r7, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp+8]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; r3 is live as the 4th argument, so it is pushed before the KCFI check
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 not live as an argument; the target lands in r3, so r12 is used as scratch
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp3
+; ASM-NEXT: udf #131
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r7, pc}
+; Only 3 arguments total, so r3 is not used as a call argument.
+; The target is in r3, so r12 is used as scratch (no spill needed).
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi.ll b/llvm/test/CodeGen/ARM/kcfi.ll
deleted file mode 100644
index 9e16468..0000000
--- a/llvm/test/CodeGen/ARM/kcfi.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
-
-; CHECK-LABEL: .globl nosan
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: .type nosan,%function
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: nosan:
-define dso_local void @nosan() nounwind {
- ret void
-}
-
-;; The alignment is at least 4 to avoid unaligned type hash loads when this
-;; instrumented function is indirectly called.
-; CHECK-LABEL: .globl f1
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: .type f1,%function
-; CHECK-NEXT: .long 3170468932
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: f1:
-define void @f1(ptr noundef %x) !kcfi_type !1 {
- ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"kcfi", i32 1}
-!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/ARM/out-of-registers.ll b/llvm/test/CodeGen/ARM/out-of-registers.ll
index c6488f1..8da2069 100644
--- a/llvm/test/CodeGen/ARM/out-of-registers.ll
+++ b/llvm/test/CodeGen/ARM/out-of-registers.ll
@@ -32,7 +32,7 @@ declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vl
; Function Attrs: nounwind
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/ARM/relax-per-target-feature.ll b/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
index 71db294..99ed6f3 100644
--- a/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
+++ b/llvm/test/CodeGen/ARM/relax-per-target-feature.ll
@@ -30,5 +30,5 @@ entry:
attributes #0 = { nounwind "disable-tail-calls"="false" "target-cpu"="cortex-a53" "target-features"="+crypto,+fp-armv8,+neon,+soft-float-abi,+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "use-soft-float"="true" }
-attributes #2 = { nounwind "disable-tail-calls"="false" "target-cpu"="arm7tdmi" "target-features"="+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #2 = { nounwind "disable-tail-calls"="false" "target-cpu"="arm7tdmi" "target-features"="+strict-align,+thumb-mode,-crc,-dotprod,-dsp,-hwdiv,-hwdiv-arm,-ras" "use-soft-float"="true" }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
index 76df93b..2aa7611 100644
--- a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
+++ b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll
@@ -32,4 +32,4 @@ land.end: ; preds = %land.rhs, %entry
ret void
}
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll b/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
index 6f2cb42..2cf6d29 100644
--- a/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
+++ b/llvm/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
@@ -25,7 +25,7 @@ declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture readonly, i32,
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #2
-attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
-attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/ARM/stack_guard_remat.ll b/llvm/test/CodeGen/ARM/stack_guard_remat.ll
index 983ef13..0930ccc 100644
--- a/llvm/test/CodeGen/ARM/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/ARM/stack_guard_remat.ll
@@ -68,7 +68,7 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
;--- pic-flag.ll
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll b/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
index 24df0d3..868dc03 100644
--- a/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
+++ b/llvm/test/CodeGen/ARM/struct-byval-frame-index.ll
@@ -34,4 +34,4 @@ entry:
; Function Attrs: nounwind
declare void @RestoreMVBlock8x8(i32, i32, ptr byval(%structN) nocapture, i32) #1
-attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/subtarget-align.ll b/llvm/test/CodeGen/ARM/subtarget-align.ll
index a24b487..f87e21f 100644
--- a/llvm/test/CodeGen/ARM/subtarget-align.ll
+++ b/llvm/test/CodeGen/ARM/subtarget-align.ll
@@ -18,7 +18,7 @@ entry:
ret i32 0
}
-attributes #0 = { "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "use-soft-float"="false" }
attributes #1 = { "target-cpu"="arm7tdmi" "target-features"="+armv4t" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/ARM/unschedule-first-call.ll b/llvm/test/CodeGen/ARM/unschedule-first-call.ll
index e0bb787..ad422f7 100644
--- a/llvm/test/CodeGen/ARM/unschedule-first-call.ll
+++ b/llvm/test/CodeGen/ARM/unschedule-first-call.ll
@@ -128,7 +128,7 @@ declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="arm1176jzf-s" "target-features"="+dsp,+strict-align,+vfp2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="arm1176jzf-s" "target-features"="+dsp,+strict-align,+vfp2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/ARM/vector-spilling.ll b/llvm/test/CodeGen/ARM/vector-spilling.ll
index 5dc20a8..8d1339844 100644
--- a/llvm/test/CodeGen/ARM/vector-spilling.ll
+++ b/llvm/test/CodeGen/ARM/vector-spilling.ll
@@ -30,4 +30,4 @@ entry:
declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
-attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
index 892b261..4e36711 100644
--- a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -132,4 +132,4 @@ entry:
declare void @capture(ptr, ptr)
-attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noredzone "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 3face71..b32910b 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -64,16 +64,19 @@ define void @dynalloca2(i16 %x) {
; CHECK-NEXT: out 63, r0
; CHECK-NEXT: out 61, {{.*}}
; Store values on the stack
-; CHECK: ldi r20, 0
-; CHECK: ldi r21, 0
-; CHECK: std Z+8, r21
-; CHECK: std Z+7, r20
-; CHECK: std Z+6, r21
-; CHECK: std Z+5, r20
-; CHECK: std Z+4, r21
-; CHECK: std Z+3, r20
-; CHECK: std Z+2, r21
-; CHECK: std Z+1, r20
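+; The patterns below use FileCheck variables: [[REG1:r[0-9]+]] binds REG1 to
+; whatever register the compiler actually picked, and each later [[REG1]] use
+; must match that same register, so the test no longer hardcodes the allocation.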
+; CHECK: ldi [[REG1:r[0-9]+]], 0
+; CHECK: ldi [[REG2:r[0-9]+]], 0
+; CHECK: std Z+8, [[REG2]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
+; CHECK: std Z+1, [[REG1]]
; CHECK: call
; Call frame restore
; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/issue-163015.ll b/llvm/test/CodeGen/AVR/issue-163015.ll
new file mode 100644
index 0000000..6c4dc51
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/issue-163015.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=avr | FileCheck %s
+
+@ui1 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@ui2 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@failed = private unnamed_addr addrspace(1) constant [12 x i8] c"test failed\00"
+@stats2 = external protected global i16, align 1
+
+; CHECK-LABEL: main:
+define i32 @main() addrspace(1) {
+entry:
+ store i64 94, ptr @ui1, align 8
+ store i64 53, ptr @ui2, align 8
+ tail call addrspace(1) void @foo(i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 32, ptr @stats2)
+ %0 = load i64, ptr @ui1, align 8
+ %1 = load i64, ptr @ui2, align 8
+
+; COM: CHECK: call __udivdi3
+ %2 = udiv i64 %0, %1
+
+; Look for the buggy pattern where r30/r31 are clobbered, corrupting the stack pointer.
+; CHECK-NOT: std Z+{{[1-9]+}}, r30
+; CHECK-NOT: std Z+{{[1-9]+}}, r31
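+; A failing build emitted sequences roughly like (offsets illustrative):
+;   std Z+5, r30
+;   std Z+6, r31
+; i.e. the halves of the Z base pointer showing up as store data, a sign they
+; had been reused as scratch registers.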
+
+; CHECK: call expect
+ tail call addrspace(1) void @expect(i64 %2, i64 1, i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 33)
+
+; CHECK: ret
+ ret i32 0
+}
+
+declare protected void @expect(i64, i64, i16, i16, i8, i16) local_unnamed_addr addrspace(1)
+declare protected void @foo(i16, i16, i8, i16, ptr) local_unnamed_addr addrspace(1)
diff --git a/llvm/test/CodeGen/BPF/BTF/binary-format.ll b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
index 3b1be1a..fd09566 100644
--- a/llvm/test/CodeGen/BPF/BTF/binary-format.ll
+++ b/llvm/test/CodeGen/BPF/BTF/binary-format.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -gdwarf-5 -gembed-source -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %a, !dbg !14
@@ -42,10 +42,7 @@ entry:
; CHECK-EB: 0x00000050 00000008 0000000f 00000018 00000410
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
index 2fb8d25..1672334 100644
--- a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id.ll
@@ -24,7 +24,7 @@
@bpf_log = internal global ptr inttoptr (i64 999 to ptr), align 8, !dbg !17
; Function Attrs: nounwind
-define dso_local void @prog1() #0 !dbg !28 {
+define dso_local void @prog1() !dbg !28 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !31, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 0), !dbg !36, !llvm.preserve.access.index !7
@@ -33,10 +33,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local void @prog2() #0 !dbg !38 {
+define dso_local void @prog2() !dbg !38 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !39, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 1, i64 0), !dbg !40, !llvm.preserve.access.index !6
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind
-define dso_local void @prog3() #0 !dbg !42 {
+define dso_local void @prog3() !dbg !42 {
entry:
%0 = load ptr, ptr @bpf_log, align 8, !dbg !43, !tbaa !32
%1 = call i64 @llvm.bpf.btf.type.id(i32 2, i64 1), !dbg !44, !llvm.preserve.access.index !11
@@ -96,9 +96,6 @@ entry:
; CHECK-NEXT: .long 48
; CHECK-NEXT: .long 7
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!24, !25, !26}
!llvm.ident = !{!27}
diff --git a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
index cc14a32b..1c2b1d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
+++ b/llvm/test/CodeGen/BPF/BTF/char-no-debuginfo.ll
@@ -10,7 +10,7 @@
@g = dso_local local_unnamed_addr global i32 5, section "maps", align 4
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 {
+define dso_local i32 @test() local_unnamed_addr {
%1 = load i32, ptr @g, align 4, !tbaa !2
ret i32 %1
}
@@ -18,8 +18,6 @@ define dso_local i32 @test() local_unnamed_addr #0 {
; CHECK-NOT: .section .BTF
; CHECK-NOT: .section .BTF.ext
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
index a855016..fa0aa5b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-builtin.ll
@@ -10,7 +10,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind readonly
-define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr #0 !dbg !13 {
+define dso_local i64 @test(ptr readonly %skb) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %skb, metadata !17, metadata !DIExpression()), !dbg !18
%call = tail call i64 @llvm.bpf.load.byte(ptr %skb, i64 10), !dbg !19
@@ -54,13 +54,9 @@ entry:
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readonly
-declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64) #1
+declare !dbg !4 i64 @llvm.bpf.load.byte(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
index b7cbb48f..9a31beb 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-func-arg.ll
@@ -8,9 +8,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -49,11 +49,7 @@ entry:
; CHECK: .ascii "char" # string offset=55
; CHECK: .ascii "global_func" # string offset=60
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
index 299aa1d..c3f93ab 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-global-var.ll
@@ -10,7 +10,7 @@
@a = external dso_local local_unnamed_addr global i8, align 1
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !7 {
%1 = load i8, ptr @a, align 1, !dbg !11, !tbaa !12
%2 = sext i8 %1 to i32, !dbg !11
ret i32 %2, !dbg !15
@@ -45,8 +45,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .ascii "/home/yhs/work/tests/llvm/bug/test.c" # string offset=15
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
index d11addd..0ddd634 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -69,10 +69,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .ascii "abc" # string offset=72
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
index 9e82295..fbfc03b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak.ll
@@ -10,12 +10,12 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
+declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -62,10 +62,6 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
!llvm.ident = !{!12}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
index 262abb3..0ba4732 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func.ll
@@ -10,9 +10,9 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test() local_unnamed_addr !dbg !13 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !16
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !16
ret i32 %call, !dbg !17
}
@@ -61,11 +61,7 @@ entry:
; CHECK-NEXT: .ascii "global_func" # string offset=60
; CHECK-NEXT: .byte 0
-declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !4 dso_local i32 @global_func(i8 signext) local_unnamed_addr
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
index b6e14fc..27793d1 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
@@ -13,9 +13,9 @@
@ch = external dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
@@ -84,11 +84,7 @@ entry:
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !dbg !6 dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
index 63ab578..ffec16b 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct-weak.ll
@@ -12,7 +12,7 @@
@global = extern_weak dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -68,8 +68,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
index 3ecda4f..dfe5e5e 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-struct.ll
@@ -13,7 +13,7 @@
@global = external dso_local local_unnamed_addr global %struct.t1, align 4, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test() local_unnamed_addr !dbg !15 {
entry:
%0 = load i32, ptr @global, align 4, !dbg !18, !tbaa !19
ret i32 %0, !dbg !24
@@ -69,8 +69,6 @@ entry:
; CHECK-NEXT: .ascii "global" # string offset=66
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
!llvm.ident = !{!14}
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
index 57ca18c..7d28987 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
@@ -12,15 +12,15 @@
@ch = extern_weak dso_local local_unnamed_addr global i8, section "abc", align 1, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
- %call = tail call i32 @global_func(i8 signext 0) #2, !dbg !19
+ %call = tail call i32 @global_func(i8 signext 0), !dbg !19
%0 = load i8, ptr @ch, align 1, !dbg !20, !tbaa !21
%conv = sext i8 %0 to i32, !dbg !20
%add = add nsw i32 %call, %conv, !dbg !24
ret i32 %add, !dbg !25
}
-declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr #1 section "abc"
+declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed_addr section "abc"
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -84,10 +84,6 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
; CHECK-NEXT: .ascii "abc" # string offset=75
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
!llvm.ident = !{!15}
diff --git a/llvm/test/CodeGen/BPF/BTF/filename.ll b/llvm/test/CodeGen/BPF/BTF/filename.ll
index ae08aea..0d8742fa 100644
--- a/llvm/test/CodeGen/BPF/BTF/filename.ll
+++ b/llvm/test/CodeGen/BPF/BTF/filename.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone uwtable
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
ret i32 0, !dbg !11
}
@@ -63,8 +63,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1038 # Line 1 Col 14
-attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index b700be9..f8c3de5 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -14,7 +14,7 @@
@b1 = common dso_local local_unnamed_addr global %struct.t1 zeroinitializer, align 8, !dbg !6
; Function Attrs: nounwind readnone
-define dso_local void @f1(i32 %p2) local_unnamed_addr #0 !dbg !19 {
+define dso_local void @f1(i32 %p2) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata i32 %p2, metadata !21, metadata !DIExpression()), !dbg !22
ret void, !dbg !23
@@ -95,10 +95,7 @@ entry:
; CHECK-NEXT: .long 3091 # Line 3 Col 19
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
index 2f562b2..745645d 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-non-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32 returned) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 %0, !dbg !14
}
@@ -73,10 +73,7 @@ define dso_local i32 @f1(i32 returned) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-source.ll b/llvm/test/CodeGen/BPF/BTF/func-source.ll
index a485d2c..c305e83 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-source.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-source.ll
@@ -10,7 +10,7 @@
; correct reference to the lines in the string table.
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f() local_unnamed_addr !dbg !7 {
entry:
ret void, !dbg !10
}
@@ -63,8 +63,6 @@ entry:
; CHECK-NEXT: .long 18
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 2570536..388deeb 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -9,7 +9,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f(i32 returned %a) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f(i32 returned %a) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !15
ret i32 %a, !dbg !16
@@ -85,12 +85,8 @@ entry:
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 3092 # Line 3 Col 20
-
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
index f9439e6..380642c 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-unused-arg.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: nounwind readnone
-define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(i32) local_unnamed_addr !dbg !7 {
call void @llvm.dbg.value(metadata i32 %0, metadata !12, metadata !DIExpression()), !dbg !13
ret i32 0, !dbg !14
}
@@ -69,10 +69,7 @@ define dso_local i32 @f1(i32) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 1042 # Line 1 Col 18
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/func-void.ll b/llvm/test/CodeGen/BPF/BTF/func-void.ll
index bf70b6a..9205700 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-void.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-void.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm t.c
; Function Attrs: norecurse nounwind readnone
-define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
+define dso_local void @f1() local_unnamed_addr !dbg !7 {
ret void, !dbg !10
}
@@ -57,8 +57,6 @@ define dso_local void @f1() local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 1040 # Line 1 Col 16
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
index 6ef7a30..5c797f7 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-1.ll
@@ -21,16 +21,16 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5] }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
call void @llvm.dbg.value(metadata ptr @.str, metadata !12, metadata !DIExpression()), !dbg !25
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %val), !dbg !26
call void @llvm.dbg.declare(metadata ptr %val, metadata !16, metadata !DIExpression()), !dbg !27
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %val, ptr nonnull align 4 dereferenceable(16) @__const.test.val, i64 16, i1 false), !dbg !27
- tail call void @foo(ptr @.str) #4, !dbg !28
- call void @foo(ptr nonnull %val) #4, !dbg !29
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val) #4, !dbg !30
+ tail call void @foo(ptr @.str), !dbg !28
+ call void @foo(ptr nonnull %val), !dbg !29
+ call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %val), !dbg !30
ret i32 0, !dbg !31
}
@@ -39,27 +39,21 @@ entry:
; CHECK-NOT: BTF_KIND_DATASEC
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !32 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
index 0e183a5..243cd87 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var-readonly-2.ll
@@ -19,14 +19,14 @@
@__const.test.val = private unnamed_addr constant %struct.anon { [4 x i32] [i32 2, i32 3, i32 4, i32 5], i8 4 }, align 4
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
%val = alloca %struct.anon, align 4
- call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val) #4, !dbg !23
+ call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %val), !dbg !23
call void @llvm.dbg.declare(metadata ptr %val, metadata !12, metadata !DIExpression()), !dbg !24
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(20) %val, ptr nonnull align 4 dereferenceable(20) @__const.test.val, i64 20, i1 false), !dbg !24
- call void @foo(ptr nonnull %val) #4, !dbg !25
- call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val) #4, !dbg !26
+ call void @foo(ptr nonnull %val), !dbg !25
+ call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %val), !dbg !26
ret i32 0, !dbg !27
}
@@ -38,24 +38,18 @@ entry:
; CHECK: .ascii ".rodata" # string offset=42
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr #3
+declare !dbg !28 dso_local void @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/local-var.ll b/llvm/test/CodeGen/BPF/BTF/local-var.ll
index dd79923..fa605d8 100644
--- a/llvm/test/CodeGen/BPF/BTF/local-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/local-var.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @foo(i8 signext) local_unnamed_addr !dbg !7 {
%2 = alloca i16, align 2
call void @llvm.dbg.value(metadata i8 %0, metadata !13, metadata !DIExpression()), !dbg !17
call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %2), !dbg !18
@@ -59,20 +59,16 @@ define dso_local i32 @foo(i8 signext) local_unnamed_addr #0 !dbg !7 {
; CHECK-NEXT: .byte 0
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
index 8fef9c2..733815d 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-const.ll
@@ -22,14 +22,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !23
ret i32 0, !dbg !24
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !25 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !25 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = load i32, ptr %arg, align 4, !dbg !35, !tbaa !36
@@ -64,11 +64,7 @@ entry:
; CHECK: .ascii "m2" # string offset=72
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
index 4c8aa1f..727daea 100644
--- a/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/pruning-typedef.ll
@@ -24,14 +24,14 @@
%struct.s2 = type { %struct.tt }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test1(ptr nocapture readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
ret i32 0, !dbg !25
}
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr #1 !dbg !26 {
+define dso_local i32 @test2(ptr nocapture readonly %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = load i32, ptr %arg, align 4, !dbg !36, !tbaa !37
@@ -71,11 +71,7 @@ entry:
; CHECK: .ascii "m2" # string offset=81
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
new file mode 100644
index 0000000..df0cbeb
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; This IR is hand-written.
+
+; ModuleID = 'ptr-named-2.ll'
+source_filename = "ptr-named-2.ll"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel-unknown-none"
+
+%struct.TypeExamples = type { ptr, i32, i32, ptr }
+
+@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!2, !3, !4}
+!llvm.ident = !{!21}
+
+; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64
+; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128
+; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3
+; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3
+; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7
+; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static
+; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=8 offset=0 size=24
+
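+; For orientation, the debug info above describes something like the following
+; C struct (approximate: the member names "volatile" and "const" are not valid
+; C identifiers, and each member occupies a 64-bit slot in this hand-written DI):
+;
+;   struct TypeExamples {
+;     int *ptr;           /* [2] PTR      -> [3] INT */
+;     volatile int v;     /* [4] VOLATILE -> [3] INT */
+;     const int c;        /* [5] CONST    -> [3] INT */
+;     int *restrict rp;   /* [6] RESTRICT -> [7] PTR -> [3] INT */
+;   };
+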
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None)
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true)
+!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp")
+!7 = !{}
+!8 = !{!0}
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10)
+!10 = !{!11, !12, !13, !14}
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64)
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16)
+!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16)
+!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64)
+!21 = !{!"my hand-written IR"}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
new file mode 100644
index 0000000..675c34e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+; #![no_std]
+; #![no_main]
+;
+; pub struct MyType {
+; ptr: *const u32,
+; }
+;
+; impl MyType {
+; pub const fn new() -> Self {
+; let ptr = core::ptr::null();
+; Self { ptr }
+; }
+; }
+;
+; unsafe impl Sync for MyType {}
+;
+; #[unsafe(no_mangle)]
+; pub static X: MyType = MyType::new();
+;
+; #[cfg(not(test))]
+; #[panic_handler]
+; fn panic(_info: &core::panic::PanicInfo) -> ! {
+; loop {}
+; }
+; Compilation flags:
+; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc
+; llvm-dis ptr-named.bc -o ptr-named.ll
+
+; ModuleID = 'ptr-named.bc'
+source_filename = "1m2uqe50qkwxmo53ydydvou91"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0
+
+!llvm.module.flags = !{!11, !12, !13, !14}
+!llvm.ident = !{!15}
+!llvm.dbg.cu = !{!16}
+
+; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=4 offset=0 size=8
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64)
+!2 = !DINamespace(name: "ptr_named", scope: null)
+!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0)
+!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!10 = !{}
+!11 = !{i32 8, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{i32 7, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 3}
+!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"}
+!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None)
+!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named")
+!18 = !{!0}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-func.ll b/llvm/test/CodeGen/BPF/BTF/static-func.ll
index fc79dbf..6506407 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-func.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-func.ll
@@ -9,18 +9,18 @@
; clang -target bpf -O2 -g -S -emit-llvm test.c
; Function Attrs: nounwind
-define dso_local i32 @test2() local_unnamed_addr #0 !dbg !12 {
+define dso_local i32 @test2() local_unnamed_addr !dbg !12 {
entry:
%call = tail call fastcc i32 @test1(), !dbg !13
ret i32 %call, !dbg !14
}
; Function Attrs: noinline nounwind
-define internal fastcc i32 @test1() unnamed_addr #1 !dbg !15 {
+define internal fastcc i32 @test1() unnamed_addr !dbg !15 {
entry:
- %call = tail call i32 @foo() #3, !dbg !16
+ %call = tail call i32 @foo(), !dbg !16
ret i32 %call, !dbg !17
}
-declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
+declare !dbg !4 dso_local i32 @foo() local_unnamed_addr
; CHECK: .section .BTF,"",@progbits
; CHECK-NEXT: .short 60319 # 0xeb9f
@@ -67,11 +67,6 @@ declare !dbg !4 dso_local i32 @foo() local_unnamed_addr #2
; CHECK-NEXT: .ascii "foo" # string offset=60
; CHECK-NEXT: .byte 0
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
!llvm.ident = !{!11}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
index 1827c97..fedec38 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-derived-type.ll
@@ -17,7 +17,7 @@
@v4 = internal constant ptr null, align 8, !dbg !19
; Function Attrs: norecurse nounwind
-define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
+define dso_local i64 @foo() local_unnamed_addr !dbg !27 {
%1 = load volatile ptr, ptr @v1, align 8, !dbg !29, !tbaa !30
%2 = load volatile ptr, ptr @v2, align 8, !dbg !34, !tbaa !30
%3 = ptrtoint ptr %1 to i64, !dbg !35
@@ -141,8 +141,6 @@ define dso_local i64 @foo() local_unnamed_addr #0 !dbg !27 {
; CHECK-NEXT: .ascii ".rodata" # string offset=87
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!23, !24, !25}
!llvm.ident = !{!26}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
index cc785b7..deef48a 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
index 2b62882..8f29a83 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-inited.ll
@@ -14,7 +14,7 @@
@a = internal global i8 3, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".data" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
index a4ae948..e16b467 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly-sec.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
index a9d60ce..1ddd499 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-readonly.ll
@@ -14,7 +14,7 @@
@a = internal constant i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !22, !tbaa !23
%2 = sext i8 %1 to i32, !dbg !22
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !26, !tbaa !27
@@ -99,8 +99,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".rodata" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!18, !19, !20}
!llvm.ident = !{!21}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
index ac27b2b..0ff8f2e 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-sec.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, section "maps", align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii "maps" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
index 28da203..fe9f508 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var-zerolen-array.ll
@@ -15,7 +15,7 @@
@sv = internal global { i32, i32, [10 x i8] } { i32 3, i32 4, [10 x i8] c"abcdefghi\00" }, align 4, !dbg !0
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test() local_unnamed_addr !dbg !21 {
%1 = load volatile i32, ptr @sv, align 4, !dbg !24, !tbaa !25
ret i32 %1, !dbg !29
}
@@ -104,8 +104,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !21 {
; CHECK-NEXT: .ascii ".data" # string offset=89
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
!llvm.ident = !{!20}
diff --git a/llvm/test/CodeGen/BPF/BTF/static-var.ll b/llvm/test/CodeGen/BPF/BTF/static-var.ll
index 461bd27..f7710e3 100644
--- a/llvm/test/CodeGen/BPF/BTF/static-var.ll
+++ b/llvm/test/CodeGen/BPF/BTF/static-var.ll
@@ -14,7 +14,7 @@
@a = internal global i8 0, align 1, !dbg !10
; Function Attrs: norecurse nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
+define dso_local i32 @foo() local_unnamed_addr !dbg !2 {
%1 = load volatile i8, ptr @a, align 1, !dbg !20, !tbaa !21
%2 = sext i8 %1 to i32, !dbg !20
%3 = load volatile i16, ptr @foo.b, align 2, !dbg !24, !tbaa !25
@@ -93,8 +93,6 @@ define dso_local i32 @foo() local_unnamed_addr #0 !dbg !2 {
; CHECK-NEXT: .ascii ".bss" # string offset=71
; CHECK-NEXT: .byte 0
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!16, !17, !18}
!llvm.ident = !{!19}
diff --git a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
index 5b125ea..68d4be0 100644
--- a/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/struct-anon-2.ll
@@ -15,7 +15,7 @@
%struct.anon.0 = type { i64 }
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @f1(ptr nocapture readnone %s1) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %s1, metadata !25, metadata !DIExpression()), !dbg !26
ret i32 0, !dbg !27
@@ -65,12 +65,8 @@ entry:
; CHECK: .ascii "B1" # string offset=17
; CHECK: .ascii "long int" # string offset=20
-
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
index 4b3b557..14cb8e0 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global-2.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 2, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,9 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".data" # string offset=62
-
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global.ll b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
index ea0a887..5605e0b 100644
--- a/llvm/test/CodeGen/BPF/BTF/weak-global.ll
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global.ll
@@ -11,7 +11,7 @@
@g = weak dso_local local_unnamed_addr global i8 0, align 1, !dbg !0
; Function Attrs: norecurse nounwind readonly
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test() local_unnamed_addr !dbg !11 {
entry:
%0 = load i8, ptr @g, align 1, !dbg !15, !tbaa !16
%conv = sext i8 %0 to i32, !dbg !15
@@ -37,8 +37,6 @@ entry:
; CHECK: .byte 103 # string offset=60
; CHECK: .ascii ".bss" # string offset=62
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
index 23a4617..eecb993 100644
--- a/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/btf-id-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -24,13 +24,13 @@ entry:
}
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.btf.type.id(i32, i64) #2
+declare i64 @llvm.bpf.btf.type.id(i32, i64)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !25 {
+define dso_local i32 @bar(ptr %arg) !dbg !25 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -58,10 +58,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 6
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
index 40a2432..0851f25 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-alu32.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
@c = common dso_local global %struct.b zeroinitializer, align 4, !dbg !0
; Function Attrs: nounwind readnone
-define dso_local i32 @f() local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @f() local_unnamed_addr !dbg !15 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr elementtype(%struct.b) nonnull @c, i32 1, i32 1), !dbg !18, !llvm.preserve.access.index !6
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !19
@@ -40,13 +40,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.bs(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index b8b7a0b..51df39b 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -25,7 +25,7 @@ target triple = "bpfeb"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
@@ -177,4 +173,3 @@ attributes #2 = { nounwind readnone speculatable }
!36 = !DILocation(line: 14, column: 10, scope: !13)
!37 = !DILocation(line: 13, column: 67, scope: !13)
!38 = !DILocation(line: 12, column: 3, scope: !13)
-
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 4cf0a13..295c105 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -25,7 +25,7 @@ target triple = "bpfel"
%struct.s = type { i64, i32, i32, i32, i8, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
; CHECK-ALU64-LABEL: test:
; CHECK-ALU64: .Ltest$local:
; CHECK-ALU64-NEXT: .type .Ltest$local,@function
@@ -122,17 +122,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
index cdcd7e6..8f83404 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2-bpfeb.ll
@@ -26,7 +26,7 @@ target triple = "bpfeb"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
index dd7f1c7..1a7619a 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-2.ll
@@ -26,7 +26,7 @@ target triple = "bpfel"
%struct.s = type <{ i8, i16 }>
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 4), !dbg !29, !llvm.preserve.access.index !18
@@ -70,17 +70,13 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
index 126bd0a..5a98b05 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-duplicate.ll
@@ -13,7 +13,7 @@
%struct.s1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @foo(ptr %arg) #0 !dbg !7 {
+define dso_local i32 @foo(ptr %arg) !dbg !7 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -25,13 +25,13 @@ entry:
}
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind
-define dso_local i32 @bar(ptr %arg) #0 !dbg !29 {
+define dso_local i32 @bar(ptr %arg) !dbg !29 {
entry:
%arg.addr = alloca ptr, align 8
store ptr %arg, ptr %arg.addr, align 8, !tbaa !18
@@ -60,10 +60,6 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
index 90681d3c..00c3a6d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.value(metadata ptr null, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr elementtype(%struct.s1) null, i32 0, i32 0), !dbg !23, !llvm.preserve.access.index !8
@@ -40,17 +40,13 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg) #1
+declare ptr @llvm.preserve.array.access.index.p0.s1s.p0.s1s(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64 immarg)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
index d6bed6c..7e2e8e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-array.ll
@@ -15,12 +15,12 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr elementtype(%struct.s) %arg, i32 0, i32 2), !dbg !19, !llvm.preserve.access.index !11
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %0, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %1), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -39,22 +39,16 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.ss.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
index 525f38d..cb6674f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
index 11235b5..2697201 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !16
@@ -71,27 +71,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0.s1s(ptr, i64)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
index e3382d6..b7541f0 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
index fda7592..0220567 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-byte-size-4.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8, i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readnone %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr elementtype(%struct.s1) %arg, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !17
@@ -41,17 +41,13 @@ entry:
; CHECK-NEXT: .long 1
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
index 69872db3..0404deb 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
index 90706e9..240083f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
index 2297040..57dd5b7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-existence-3.ll
@@ -19,7 +19,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !22
@@ -59,27 +59,23 @@ entry:
; CHECK-NEXT: .long 2
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
index 503a26c..7caa667 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1-bpfeb.ll
@@ -23,7 +23,7 @@ target triple = "bpfeb"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
index 0327f1a..c518573 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-1.ll
@@ -23,7 +23,7 @@ target triple = "bpfel"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -86,20 +86,16 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
index 2a92d08..6bf29d4 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-lshift-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -60,24 +60,20 @@ entry:
; CHECK-NEXT: .long 4
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
index 6e62bb3..441366f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%struct.s1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !34, !llvm.preserve.access.index !16
@@ -85,20 +85,16 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
index 77ea26a..7bc994d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-2.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i8 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !30
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !31, !llvm.preserve.access.index !16
@@ -59,24 +59,20 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
index 556f69f..ebfecff 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-rshift-3.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
%struct.s1 = type { [5 x [5 x i8]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !18 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !36, !llvm.preserve.access.index !23
@@ -60,27 +60,23 @@ entry:
; CHECK-NEXT: .long 5
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
index 2741050..d50701c 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-1.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.u1 = type { i32 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !29, metadata !DIExpression()), !dbg !35
call void @llvm.dbg.value(metadata ptr %arg2, metadata !30, metadata !DIExpression()), !dbg !35
@@ -85,29 +85,25 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.u1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
index b71bbf3..312d40f 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-2.ll
@@ -25,7 +25,7 @@ target triple = "bpf"
%struct.s1 = type { i32, i16 }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !42, !llvm.preserve.access.index !24
@@ -76,24 +76,20 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
index 5caea97..12a21c7 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-fieldinfo-signedness-3.ll
@@ -24,7 +24,7 @@ target triple = "bpf"
%struct.s1 = type { [10 x i32], [10 x [10 x i32]] }
; Function Attrs: nounwind readnone
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !29 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !29 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !43, metadata !DIExpression()), !dbg !46
%0 = tail call ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr %arg, i32 1), !dbg !47, !llvm.preserve.access.index !33
@@ -66,27 +66,23 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.u1s.p0.u1s(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #1
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!25, !26, !27}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
index 8b95b1c4..13c7d1d 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-struct.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -39,19 +39,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
index 88658b6..8583322 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-enum-value.ll
@@ -20,7 +20,7 @@ target triple = "bpf"
@2 = private unnamed_addr constant [18 x i8] c"VAL10:-2147483648\00", align 1
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !18 {
+define dso_local i32 @test() local_unnamed_addr !dbg !18 {
entry:
%0 = tail call i64 @llvm.bpf.preserve.enum.value(i32 0, ptr @0, i64 0), !dbg !23, !llvm.preserve.access.index !3
%1 = tail call i64 @llvm.bpf.preserve.enum.value(i32 1, ptr @1, i64 1), !dbg !24, !llvm.preserve.access.index !3
@@ -81,10 +81,7 @@ entry:
; CHECK-NEXT: .long 11
; Function Attrs: nounwind readnone
-declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i64 @llvm.bpf.preserve.enum.value(i32, ptr, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
index 0bdf954..6f316d9 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-exist.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 0), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 0), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 8
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
index ddd3711..d3aacc72 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-1.ll
@@ -17,7 +17,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = tail call i32 @llvm.bpf.preserve.type.info(i32 0, i64 1), !dbg !19, !llvm.preserve.access.index !8
%1 = tail call i32 @llvm.bpf.preserve.type.info(i32 1, i64 1), !dbg !20, !llvm.preserve.access.index !21
@@ -59,10 +59,7 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #1
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
index b2f8e48..ad4fc96 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-typeinfo-type-size-2.ll
@@ -20,7 +20,7 @@
target triple = "bpf"
; Function Attrs: nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
call void @llvm.dbg.declare(metadata ptr undef, metadata !20, metadata !DIExpression()), !dbg !28
call void @llvm.dbg.declare(metadata ptr undef, metadata !19, metadata !DIExpression()), !dbg !29
@@ -65,14 +65,10 @@ entry:
; CHECK-NEXT: .long 9
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.type.info(i32, i64) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { nounwind readnone }
+declare i32 @llvm.bpf.preserve.type.info(i32, i64)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
index ef360929..e0217dd 100644
--- a/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/intrinsic-union.ll
@@ -15,11 +15,11 @@ target triple = "bpf"
%union.u = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !17, metadata !DIExpression()), !dbg !18
%0 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %arg, i32 1), !dbg !19, !llvm.preserve.access.index !12
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
; CHECK-LABEL: test
@@ -38,19 +38,13 @@ entry:
; CHECK-NEXT: .long 26
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
index 4c6ce1e..819ee31 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll
@@ -15,7 +15,7 @@ target triple = "bpf"
%struct.tt = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !16, metadata !DIExpression()), !dbg !17
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr elementtype(%struct.tt) %arg, i32 0, i32 0), !dbg !18, !llvm.preserve.access.index !12
@@ -26,14 +26,10 @@ entry:
; CHECK-NOT: llvm.tt:0:0$0:0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.tts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
index 9998c98..c3f8395 100644
--- a/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/no-narrow-load.ll
@@ -28,7 +28,7 @@ target triple = "bpf"
%struct.data_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr readonly %args) local_unnamed_addr #0 !dbg !12 {
+define dso_local void @test(ptr readonly %args) local_unnamed_addr !dbg !12 {
entry:
%data = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %args, metadata !22, metadata !DIExpression()), !dbg !29
@@ -36,7 +36,7 @@ entry:
%1 = load i32, ptr %0, align 4, !dbg !30, !tbaa !31
%and = and i32 %1, 65536, !dbg !36
call void @llvm.dbg.value(metadata i32 %and, metadata !23, metadata !DIExpression()), !dbg !29
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data) #5, !dbg !37
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %data), !dbg !37
call void @llvm.dbg.declare(metadata ptr %data, metadata !24, metadata !DIExpression()), !dbg !38
store i64 0, ptr %data, align 8, !dbg !38
%tobool = icmp eq i32 %and, 0, !dbg !39
@@ -60,8 +60,8 @@ lor.end: ; preds = %lor.end.critedge, %
%5 = phi i32 [ %phitmp, %cond.false ], [ 1, %lor.end.critedge ]
%d2 = getelementptr inbounds %struct.data_t, ptr %data, i64 0, i32 1, !dbg !49
store i32 %5, ptr %d2, align 4, !dbg !50, !tbaa !51
- call void @output(ptr nonnull %data) #5, !dbg !52
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data) #5, !dbg !53
+ call void @output(ptr nonnull %data), !dbg !52
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %data), !dbg !53
ret void, !dbg !53
}
@@ -71,28 +71,21 @@ lor.end: ; preds = %lor.end.critedge, %
; CHECK: r[[LOAD]] &= 32768
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.info_ts(ptr, i32 immarg, i32 immarg)
-declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr #4
+declare !dbg !4 dso_local void @output(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable willreturn }
-attributes #2 = { argmemonly nounwind willreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9, !10}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
index 5da2bbd..1ce453c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-access-str.ll
@@ -18,13 +18,13 @@ target triple = "bpf"
%struct.t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg1, ptr %arg2) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg1, metadata !22, metadata !DIExpression()), !dbg !24
call void @llvm.dbg.value(metadata ptr %arg2, metadata !23, metadata !DIExpression()), !dbg !24
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg1, i32 1, i32 1), !dbg !25, !llvm.preserve.access.index !12
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg2, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %0, ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %0, ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -46,22 +46,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 024ed04..0fdd704 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -24,19 +24,19 @@ target triple = "bpf"
%struct.net_device = type opaque
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca ptr, align 8
call void @llvm.dbg.value(metadata ptr %0, metadata !26, metadata !DIExpression()), !dbg !28
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2) #4, !dbg !29
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %2), !dbg !29
call void @llvm.dbg.value(metadata ptr null, metadata !27, metadata !DIExpression()), !dbg !28
store ptr null, ptr %2, align 8, !dbg !30, !tbaa !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
- %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3) #4, !dbg !36
+ %4 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 8, ptr %3), !dbg !36
%5 = load ptr, ptr %2, align 8, !dbg !37, !tbaa !31
call void @llvm.dbg.value(metadata ptr %5, metadata !27, metadata !DIExpression()), !dbg !28
%6 = icmp ne ptr %5, null, !dbg !38
%7 = zext i1 %6 to i32, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %2), !dbg !39
ret i32 %7, !dbg !40
}
@@ -122,22 +122,16 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
index e12221e..65859c86 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-1.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !22 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !22 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !32, metadata !DIExpression()), !dbg !33
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !26
@@ -30,7 +30,7 @@ entry:
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %2, i32 0, i32 0), !dbg !34, !llvm.preserve.access.index !4
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %3, i32 1, i32 2), !dbg !34, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %4, i32 1, i32 1), !dbg !34, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %5) #4, !dbg !35
+ %call = tail call i32 @get_value(ptr %5), !dbg !35
ret i32 %call, !dbg !36
}
@@ -60,13 +60,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -75,13 +75,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !19, !20}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
index 1764c9d..f42e7e6 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-array-2.ll
@@ -21,7 +21,7 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !24 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !24 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !34, metadata !DIExpression()), !dbg !35
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !28
@@ -31,7 +31,7 @@ entry:
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x %struct.v1]]) %3, i32 1, i32 2), !dbg !36, !llvm.preserve.access.index !5
%5 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x %struct.v1]) %4, i32 1, i32 3), !dbg !36, !llvm.preserve.access.index !18
%6 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %5, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %6) #4, !dbg !37
+ %call = tail call i32 @get_value(ptr %6), !dbg !37
ret i32 %call, !dbg !38
}
@@ -62,13 +62,13 @@ entry:
; CHECK-NEXT: .long 107
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
@@ -79,13 +79,7 @@ declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!20, !21, !22}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
index bbff3f6..38b1c99 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-1.ll
@@ -21,12 +21,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !18
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !5
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %1), !dbg !31
ret i32 %call, !dbg !32
}
@@ -60,22 +60,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
index bdc17e6..7730ee3a 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-2.ll
@@ -24,12 +24,12 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr elementtype(%struct.v1) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %1), !dbg !36
ret i32 %call, !dbg !37
}
@@ -47,7 +47,6 @@ entry:
; CHECK: .ascii "0:1" # string offset=45
; CHECK: .ascii "v1" # string offset=91
-
; CHECK: .long 16 # FieldReloc
; CHECK-NEXT: .long 39 # Field reloc section string offset=39
; CHECK-NEXT: .long 2
@@ -60,22 +59,16 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.v2s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v1s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
index dea6e40..e5ef549 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-struct-3.ll
@@ -22,14 +22,14 @@ target triple = "bpf"
%struct.v1 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v3) %arg, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !24
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([40 x i32]) %0, i32 1, i32 4), !dbg !32, !llvm.preserve.access.index !11
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.v1) %2, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %3), !dbg !33
ret i32 %call, !dbg !34
}
@@ -60,24 +60,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
index 98fdfde..7aeaed4 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-1.ll
@@ -24,14 +24,14 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !33, metadata !DIExpression()), !dbg !34
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !35, !llvm.preserve.access.index !20
%1 = bitcast ptr %0 to ptr, !dbg !35
%2 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %1, i32 1), !dbg !35, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %2, i64 0, i32 0, !dbg !35
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !36
+ %call = tail call i32 @get_value(ptr %b), !dbg !36
ret i32 %call, !dbg !37
}
@@ -61,21 +61,15 @@ entry:
; CHECK-NEXT: .long 45
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
index 7b63699..12c3936 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-cast-union-2.ll
@@ -22,7 +22,7 @@ target triple = "bpf"
%union.v1 = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !19 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !19 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
%0 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %arg, i32 1), !dbg !32, !llvm.preserve.access.index !24
@@ -31,7 +31,7 @@ entry:
%2 = bitcast ptr %1 to ptr, !dbg !32
%3 = tail call ptr @llvm.preserve.union.access.index.p0.p0(ptr %2, i32 1), !dbg !32, !llvm.preserve.access.index !6
%b = getelementptr inbounds %union.v1, ptr %3, i64 0, i32 0, !dbg !32
- %call = tail call i32 @get_value(ptr %b) #4, !dbg !33
+ %call = tail call i32 @get_value(ptr %b), !dbg !33
ret i32 %call, !dbg !34
}
@@ -62,24 +62,18 @@ entry:
; CHECK-NEXT: .long 118
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32) #2
+declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16, !17}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
index 499e368..ee1f0e2 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-load.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !11 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !11 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !20, metadata !DIExpression()), !dbg !21
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !22, !llvm.preserve.access.index !15
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
index 2aadbdf..3d66435 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-end-ret.ll
@@ -14,7 +14,7 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind readnone
-define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local ptr @test(ptr readnone %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !13
@@ -42,14 +42,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
index 34ea050..cf75909 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
@@ -40,11 +40,11 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !20 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !20 {
entry:
%ull = alloca i64, align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !31, metadata !DIExpression()), !dbg !37
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull) #5, !dbg !38
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ull), !dbg !38
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !39, !llvm.preserve.access.index !25
%1 = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 0), !dbg !40
call void @llvm.dbg.value(metadata i32 %1, metadata !34, metadata !DIExpression()), !dbg !37
@@ -52,7 +52,7 @@ entry:
call void @llvm.dbg.value(metadata i32 %2, metadata !35, metadata !DIExpression()), !dbg !37
%idx.ext = zext i32 %1 to i64, !dbg !43
%add.ptr = getelementptr i8, ptr %arg, i64 %idx.ext, !dbg !43
- call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr) #5, !dbg !44
+ call void @bpf_probe_read(ptr nonnull %ull, i32 %2, ptr %add.ptr), !dbg !44
%3 = call i32 @llvm.bpf.preserve.field.info.p0(ptr %0, i64 4), !dbg !45
call void @llvm.dbg.value(metadata i32 %3, metadata !36, metadata !DIExpression()), !dbg !37
%4 = load i64, ptr %ull, align 8, !dbg !46, !tbaa !47
@@ -68,7 +68,7 @@ entry:
%shr3 = lshr i64 %shl, %sh_prom1, !dbg !53
%retval.0.in = select i1 %tobool, i64 %shr3, i64 %shr, !dbg !53
%retval.0 = trunc i64 %retval.0.in to i32, !dbg !37
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull) #5, !dbg !54
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ull), !dbg !54
ret i32 %retval.0, !dbg !54
}
@@ -114,28 +114,21 @@ entry:
; CHECK-NEXT: .long 3
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #2
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
-declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr #3
+declare dso_local void @bpf_probe_read(ptr, i32, ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #4
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone speculatable willreturn }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!16, !17, !18}
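; The fieldinfo test above pairs two intrinsics: a preserve.struct.access.index
; call pins down the field, and llvm.bpf.preserve.field.info then queries one
; property of that field, selected by the i64 immediate (the test uses 0 and 4;
; in the BPF backend these kinds correspond to byte offset and bitfield left
; shift, though the test itself only checks the emitted relocations). A minimal
; sketch, with a hypothetical %struct.t and @field_off:
;
;   %struct.t = type { i32, i16 }
;   define i32 @field_off(ptr %p) {
;     ; field #1 of %struct.t (a real test also attaches
;     ; !llvm.preserve.access.index metadata to this call)
;     %fld = tail call ptr @llvm.preserve.struct.access.index.p0.p0(ptr elementtype(%struct.t) %p, i32 1, i32 1)
;     %off = tail call i32 @llvm.bpf.preserve.field.info.p0(ptr %fld, i64 0)
;     ret i32 %off
;   }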
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
index 01c5e69..d5b2d052 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2-bpfeb.ll
@@ -42,7 +42,7 @@ target triple = "bpfeb"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
index d458d41..5076e79 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
@@ -42,7 +42,7 @@ target triple = "bpfel"
%struct.s = type { i32, i16 }
; Function Attrs: nounwind readonly
-define dso_local i32 @field_read(ptr %arg) local_unnamed_addr #0 !dbg !26 {
+define dso_local i32 @field_read(ptr %arg) local_unnamed_addr !dbg !26 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !37, metadata !DIExpression()), !dbg !41
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 1, i32 2), !dbg !42, !llvm.preserve.access.index !31
@@ -157,17 +157,13 @@ sw.epilog: ; preds = %entry, %sw.bb9, %sw
; CHECK-NEXT: .long 3
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64) #1
+declare i32 @llvm.bpf.preserve.field.info.p0(ptr, i64)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
index 7657b78..2f42118 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-1.ll
@@ -19,10 +19,10 @@ target triple = "bpf"
@g = dso_local global %struct.v3 zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !16 {
+define dso_local i32 @test() local_unnamed_addr !dbg !16 {
entry:
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) nonnull @g, i32 1, i32 1), !dbg !19, !llvm.preserve.access.index !7
- %call = tail call i32 @get_value(ptr %0) #3, !dbg !20
+ %call = tail call i32 @get_value(ptr %0), !dbg !20
ret i32 %call, !dbg !21
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!12, !13, !14}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
index bed14ab..f43df76 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-2.ll
@@ -19,12 +19,12 @@ target triple = "bpf"
@g = dso_local global [4 x [5 x %struct.v3]] zeroinitializer, section "stats", align 4, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test() local_unnamed_addr !dbg !23 {
entry:
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([4 x [5 x %struct.v3]]) nonnull @g, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !6
%1 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype([5 x %struct.v3]) %0, i32 1, i32 2), !dbg !26, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %1, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %2) #3, !dbg !27
+ %call = tail call i32 @get_value(ptr %2), !dbg !27
ret i32 %call, !dbg !28
}
@@ -47,21 +47,15 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
index 49b89e2..5bc2bf9 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-global-3.ll
@@ -19,11 +19,11 @@ target triple = "bpf"
@g = dso_local local_unnamed_addr global ptr null, section "stats", align 8, !dbg !0
; Function Attrs: nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !17 {
+define dso_local i32 @test() local_unnamed_addr !dbg !17 {
entry:
%0 = load ptr, ptr @g, align 8, !dbg !20, !tbaa !21
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !20, !llvm.preserve.access.index !8
- %call = tail call i32 @get_value(ptr %1) #3, !dbg !25
+ %call = tail call i32 @get_value(ptr %1), !dbg !25
ret i32 %call, !dbg !26
}
@@ -45,15 +45,10 @@ entry:
; CHECK-NEXT: .long 23
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!13, !14, !15}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
index 4ff170cf..983383c 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-ignore.ll
@@ -13,11 +13,11 @@
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !10 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !10 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !14, metadata !DIExpression()), !dbg !15
%0 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype(i32) %arg, i32 0, i32 4), !dbg !16, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !17
+ %call = tail call i32 @get_value(ptr %0), !dbg !17
ret i32 %call, !dbg !18
}
@@ -26,19 +26,13 @@ entry:
; CHECK: .section .BTF.ext,"",@progbits
; CHECK-NOT: .long 16 # FieldReloc
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
index e5f86c2..c67d57f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll
@@ -29,7 +29,7 @@ target triple = "bpf"
%struct.t1 = type { i32 }
; Function Attrs: nounwind
-define dso_local void @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local void @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr elementtype(%struct.r1) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11
@@ -38,7 +38,7 @@ entry:
call void @llvm.dbg.value(metadata ptr %1, metadata !25, metadata !DIExpression()), !dbg !29
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr elementtype(%struct.t1) %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17
call void @llvm.dbg.value(metadata ptr %2, metadata !27, metadata !DIExpression()), !dbg !29
- tail call void @test1(ptr %0, ptr %1, ptr %2) #4, !dbg !36
+ tail call void @test1(ptr %0, ptr %1, ptr %2), !dbg !36
ret void, !dbg !37
}
@@ -67,24 +67,18 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.s1s.p0.r1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.t1s.p0.s1s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.t1s(ptr, i32, i32)
-declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr #2
+declare dso_local void @test1(ptr, ptr, ptr) local_unnamed_addr
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
index 8ca3ef5..7ffb4de 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-1.ll
@@ -17,14 +17,14 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x i32]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !21 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !21 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !25, metadata !DIExpression()), !dbg !26
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !27, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !27, !llvm.preserve.access.index !6
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %1, i32 1, i32 2), !dbg !27, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %2, i32 1, i32 3), !dbg !27, !llvm.preserve.access.index !15
- %call = tail call i32 @get_value(ptr %3) #4, !dbg !28
+ %call = tail call i32 @get_value(ptr %3), !dbg !28
ret i32 %call, !dbg !29
}
@@ -46,27 +46,21 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
index b2ba5a8..55bb7c58 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multi-array-2.ll
@@ -17,7 +17,7 @@ target triple = "bpf"
%struct.v3 = type { i32, [4 x [4 x [4 x i32]]] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !23 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !23 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !27, metadata !DIExpression()), !dbg !28
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !29, !llvm.preserve.access.index !4
@@ -25,7 +25,7 @@ entry:
%2 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x [4 x i32]]]) %1, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !11
%3 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x [4 x i32]]) %2, i32 1, i32 3), !dbg !29, !llvm.preserve.access.index !15
%4 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([4 x i32]) %3, i32 1, i32 2), !dbg !29, !llvm.preserve.access.index !17
- %call = tail call i32 @get_value(ptr %4) #4, !dbg !30
+ %call = tail call i32 @get_value(ptr %4), !dbg !30
ret i32 %call, !dbg !31
}
@@ -47,29 +47,23 @@ entry:
; CHECK-NEXT: .long 58
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32, i32)
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!19, !20, !21}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
index e00bbb8..a5b4604 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-multilevel.ll
@@ -28,16 +28,16 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !28, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !31
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !31
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !32, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %3, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !23
- %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4) #4, !dbg !33
+ %5 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %4), !dbg !33
%6 = load i32, ptr %2, align 4, !dbg !34, !tbaa !35
call void @llvm.dbg.value(metadata i32 %6, metadata !29, metadata !DIExpression()), !dbg !30
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !39
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !39
ret i32 %6, !dbg !40
}
@@ -130,25 +130,19 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
index b4d1844..ffd77ed 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-1.ll
@@ -16,11 +16,11 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %0), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,19 +42,13 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
index 87b88bc..cb0aff3 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-pointer-2.ll
@@ -16,12 +16,12 @@ target triple = "bpf"
%struct.v3 = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !15 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !19, metadata !DIExpression()), !dbg !20
%0 = tail call ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr elementtype(%struct.v3) %arg, i32 0, i32 1), !dbg !21, !llvm.preserve.access.index !4
%1 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr elementtype(%struct.v3) %0, i32 1, i32 1), !dbg !21, !llvm.preserve.access.index !6
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !22
+ %call = tail call i32 @get_value(ptr %1), !dbg !22
ret i32 %call, !dbg !23
}
@@ -42,22 +42,16 @@ entry:
; CHECK-NEXT: .long 32
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.array.access.index.p0.v3s.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.v3s(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
index 8ebbfea..2081b3f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-anonymous.ll
@@ -27,17 +27,17 @@ target triple = "bpf"
%struct.anon = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr elementtype([10 x %struct.anon]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr elementtype(%struct.anon) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -140,28 +140,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
index 64ec250..4e51366 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-struct-array.ll
@@ -28,17 +28,17 @@ target triple = "bpf"
%struct.net_device = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !31, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !34
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !34
%3 = tail call ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr elementtype(%struct.sk_buff) %0, i32 1, i32 1), !dbg !35, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr elementtype([10 x %struct.net_device]) %3, i32 1, i32 5), !dbg !35, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr elementtype(%struct.net_device) %4, i32 0, i32 0), !dbg !35, !llvm.preserve.access.index !24
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !36
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !36
%7 = load i32, ptr %2, align 4, !dbg !37, !tbaa !38
call void @llvm.dbg.value(metadata i32 %7, metadata !32, metadata !DIExpression()), !dbg !33
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !42
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !42
ret i32 %7, !dbg !43
}
@@ -143,28 +143,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.net_devices.p0.sk_buffs(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.net_devices.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.net_devices(ptr, i32 immarg, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
index ed462e1..eb0620d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-array.ll
@@ -20,12 +20,12 @@ target triple = "bpf"
%struct.__s = type { [7 x i32] }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !24, metadata !DIExpression()), !dbg !25
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 0, i32 0), !dbg !26, !llvm.preserve.access.index !13
%1 = tail call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype([7 x i32]) %0, i32 1, i32 1), !dbg !26, !llvm.preserve.access.index !19
- %call = tail call i32 @get_value(ptr %1) #4, !dbg !27
+ %call = tail call i32 @get_value(ptr %1), !dbg !27
ret i32 %call, !dbg !28
}
@@ -48,22 +48,16 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
index 6b806ae..c4edda1 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%struct.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr elementtype(%struct.__t) %arg, i32 0, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32) #1
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ts(ptr, i32, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
index c2b5a11..f8cf253 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-struct.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%struct.__s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr elementtype(%struct.__s) %arg, i32 1, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.__ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
index a63b7e7..0fe7c1f 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union-2.ll
@@ -18,7 +18,7 @@ target triple = "bpf"
%union.__t = type { i32 }
; Function Attrs: nounwind readonly
-define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr #0 !dbg !13 {
+define dso_local i32 @test(ptr readonly %arg) local_unnamed_addr !dbg !13 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !18, metadata !DIExpression()), !dbg !19
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr %arg, i32 0), !dbg !20, !llvm.preserve.access.index !4
@@ -50,14 +50,10 @@ entry:
; CHECK-NEXT: .long 0
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32) #1
+declare ptr @llvm.preserve.union.access.index.p0.__ts.p0.__ts(ptr, i32)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readnone speculatable}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11}
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
index 4b3d178..aa8705d 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef-union.ll
@@ -20,11 +20,11 @@ target triple = "bpf"
%union.__s = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !21, metadata !DIExpression()), !dbg !22
%0 = tail call ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr %arg, i32 1), !dbg !23, !llvm.preserve.access.index !14
- %call = tail call i32 @get_value(ptr %0) #4, !dbg !24
+ %call = tail call i32 @get_value(ptr %0), !dbg !24
ret i32 %call, !dbg !25
}
@@ -47,19 +47,13 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.__ss.p0.__ss(ptr, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
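; Taken together, the typedef/union tests above exercise all three access-index
; intrinsics; their shapes, as declared throughout this patch, differ only in
; the arguments:
;
;   declare ptr @llvm.preserve.struct.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
;   declare ptr @llvm.preserve.array.access.index.p0.p0(ptr, i32 immarg, i32 immarg)
;   declare ptr @llvm.preserve.union.access.index.p0.p0(ptr, i32 immarg)
;
; The struct and array forms take a base pointer (annotated with elementtype()
; at the call site), a GEP index, and a field/element index; the union form
; takes just the base pointer and the member index. Each call also carries
; !llvm.preserve.access.index metadata tying it to the debug-info type.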
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
index e757327..5195d17 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-typedef.ll
@@ -24,13 +24,13 @@ target triple = "bpf"
%struct.s = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %arg, metadata !28, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr elementtype([7 x %union.u]) %arg, i32 0, i32 1), !dbg !30, !llvm.preserve.access.index !14
%1 = tail call ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr %0, i32 1), !dbg !30, !llvm.preserve.access.index !16
%2 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %1, i32 1, i32 1), !dbg !30, !llvm.preserve.access.index !20
- %call = tail call i32 @get_value(ptr %2) #4, !dbg !31
+ %call = tail call i32 @get_value(ptr %2), !dbg !31
ret i32 %call, !dbg !32
}
@@ -53,25 +53,19 @@ entry:
; CHECK-NEXT: .long [[ACCESS_STR:[0-9]+]]
; CHECK-NEXT: .long 0
-declare dso_local i32 @get_value(ptr) local_unnamed_addr #1
+declare dso_local i32 @get_value(ptr) local_unnamed_addr
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.array.access.index.p0.us.p0.us(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.us.p0.us(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
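; The offset-reloc tests exercise the BPF CO-RE access-path intrinsics: each
; struct/union/array step is routed through an llvm.preserve.*.access.index
; call whose !llvm.preserve.access.index attachment names the debug-info type,
; and the backend folds the chain into a field relocation in .BTF.ext. A
; minimal single-step sketch (the intrinsic suffix, struct type, and metadata
; id are illustrative stand-ins, with !9 standing in for the real
; DICompositeType; for a flat struct both index operands pick the same member):
%struct.t2 = type { i32, i32 }
define ptr @second_field(ptr %p) {
entry:
  %f = call ptr @llvm.preserve.struct.access.index.p0.p0.t2(ptr elementtype(%struct.t2) %p, i32 1, i32 1), !llvm.preserve.access.index !9
  ret ptr %f
}
declare ptr @llvm.preserve.struct.access.index.p0.p0.t2(ptr, i32 immarg, i32 immarg)
!9 = !{}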
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
index 824eba9a..e156999 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-union.ll
@@ -31,17 +31,17 @@ target triple = "bpf"
%union.anon = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
+define dso_local i32 @bpf_prog(ptr) local_unnamed_addr !dbg !15 {
%2 = alloca i32, align 4
call void @llvm.dbg.value(metadata ptr %0, metadata !32, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2) #4, !dbg !35
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %2), !dbg !35
%3 = tail call ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr %0, i32 1), !dbg !36, !llvm.preserve.access.index !19
%4 = tail call ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr elementtype(%struct.anon) %3, i32 1, i32 1), !dbg !36, !llvm.preserve.access.index !23
%5 = tail call ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr %4, i32 0), !dbg !36, !llvm.preserve.access.index !27
- %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5) #4, !dbg !37
+ %6 = call i32 inttoptr (i64 4 to ptr)(ptr nonnull %2, i32 4, ptr %5), !dbg !37
%7 = load i32, ptr %2, align 4, !dbg !38, !tbaa !39
call void @llvm.dbg.value(metadata i32 %7, metadata !33, metadata !DIExpression()), !dbg !34
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2) #4, !dbg !43
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %2), !dbg !43
ret i32 %7, !dbg !44
}
@@ -145,28 +145,22 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.sk_buffs.p0.sk_buffs(ptr, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg) #2
+declare ptr @llvm.preserve.struct.access.index.p0.anons.p0.anons(ptr, i32 immarg, i32 immarg)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg) #2
+declare ptr @llvm.preserve.union.access.index.p0.anons.p0.anons(ptr, i32 immarg)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #3
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/CORE/store-addr.ll b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
index 33bbd01..2c8a0c4 100644
--- a/llvm/test/CodeGen/BPF/CORE/store-addr.ll
+++ b/llvm/test/CodeGen/BPF/CORE/store-addr.ll
@@ -22,17 +22,17 @@ target triple = "bpf"
%struct.t = type { i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test(ptr %arg) local_unnamed_addr !dbg !14 {
entry:
%param = alloca [1 x i64], align 8
call void @llvm.dbg.value(metadata ptr %arg, metadata !22, metadata !DIExpression()), !dbg !27
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param) #5, !dbg !28
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %param), !dbg !28
call void @llvm.dbg.declare(metadata ptr %param, metadata !23, metadata !DIExpression()), !dbg !29
%0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr elementtype(%struct.t) %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !18
%1 = ptrtoint ptr %0 to i64, !dbg !31
store i64 %1, ptr %param, align 8, !dbg !33, !tbaa !34
- %call = call i32 @foo(ptr nonnull %param) #5, !dbg !38
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param) #5, !dbg !39
+ %call = call i32 @foo(ptr nonnull %param), !dbg !38
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %param), !dbg !39
ret i32 %call, !dbg !40
}
@@ -41,28 +41,21 @@ entry:
; CHECK: *(u64 *)(r10 - 8) = r1
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32) #3
+declare ptr @llvm.preserve.struct.access.index.p0.p0.ts(ptr, i32, i32)
-declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr #4
+declare !dbg !5 dso_local i32 @foo(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11, !12}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
index 8a4b37d..09ca422 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp1.ll
@@ -20,12 +20,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -62,25 +62,20 @@ if.end: ; preds = %lor.lhs.false
br label %cleanup
cleanup: ; preds = %if.end, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
index ad157fe..bbda062 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-icmp2.ll
@@ -18,12 +18,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%retval = alloca i32, align 4
%ret = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !2
%0 = load i32, ptr %ret, align 4, !tbaa !2
@@ -65,25 +65,20 @@ if.end3: ; preds = %if.end
br label %cleanup
cleanup: ; preds = %if.end3, %if.then2, %if.then
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
%3 = load i32, ptr %retval, align 4
ret i32 %3
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
-declare dso_local i32 @bar(i32) #2
+declare dso_local i32 @bar(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
index d118fa0..d34d652 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative1.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i64, align 8
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %ret)
%call = call i64 @foo()
store i64 %call, ptr %ret, align 8, !tbaa !6
%0 = load i64, ptr %ret, align 8, !tbaa !6
@@ -36,7 +36,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 8, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %ret)
ret ptr %3
}
; CHECK-COMMON: [[REG6:r[0-9]+]] = r1
@@ -57,17 +57,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i64 @foo(...) #2
+declare dso_local i64 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
index 218fa5d..5f3fa94 100644
--- a/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
+++ b/llvm/test/CodeGen/BPF/adjust-opt-speculative2.ll
@@ -15,12 +15,12 @@
; clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes test.c
; Function Attrs: nounwind
-define dso_local ptr @test(ptr %p) #0 {
+define dso_local ptr @test(ptr %p) {
entry:
%p.addr = alloca ptr, align 8
%ret = alloca i32, align 4
store ptr %p, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.start.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %ret)
%call = call i32 @foo()
store i32 %call, ptr %ret, align 4, !tbaa !6
%0 = load i32, ptr %ret, align 4, !tbaa !6
@@ -37,7 +37,7 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
%3 = load ptr, ptr %p.addr, align 8, !tbaa !2
- call void @llvm.lifetime.end.p0(i64 4, ptr %ret) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %ret)
ret ptr %3
}
@@ -66,17 +66,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-COMMON: exit
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local i32 @foo(...) #2
+declare dso_local i32 @foo(...)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/callx.ll b/llvm/test/CodeGen/BPF/callx.ll
index d83e0f6..e027c1f 100644
--- a/llvm/test/CodeGen/BPF/callx.ll
+++ b/llvm/test/CodeGen/BPF/callx.ll
@@ -3,16 +3,13 @@
; int test(int (*f)(void)) { return f(); }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr nocapture %f) local_unnamed_addr {
entry:
- %call = tail call i32 %f() #1
+ %call = tail call i32 %f()
; CHECK: callx r{{[0-9]+}}
ret i32 %call
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/dwarfdump.ll b/llvm/test/CodeGen/BPF/dwarfdump.ll
index a3a6b52..d50c025 100644
--- a/llvm/test/CodeGen/BPF/dwarfdump.ll
+++ b/llvm/test/CodeGen/BPF/dwarfdump.ll
@@ -10,7 +10,7 @@ target triple = "bpf"
@testprog.myvar_c = internal unnamed_addr global i32 0, align 4, !dbg !0
; Function Attrs: nounwind
-define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
+define i32 @testprog(i32, i32) local_unnamed_addr !dbg !2 {
tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !11, metadata !16), !dbg !17
tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !12, metadata !16), !dbg !18
%3 = load i32, ptr @testprog.myvar_c, align 4, !dbg !19, !tbaa !20
@@ -21,10 +21,7 @@ define i32 @testprog(i32, i32) local_unnamed_addr #0 !dbg !2 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!7}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/CodeGen/BPF/i128.ll b/llvm/test/CodeGen/BPF/i128.ll
index a966e3e..3c94e0c 100644
--- a/llvm/test/CodeGen/BPF/i128.ll
+++ b/llvm/test/CodeGen/BPF/i128.ll
@@ -19,14 +19,14 @@
%struct.ipv6_key_t = type { i32, i128, i16 }
; Function Attrs: nounwind
-define dso_local i32 @test(i32 %pid) local_unnamed_addr #0 {
+define dso_local i32 @test(i32 %pid) local_unnamed_addr {
entry:
%ipv6_key = alloca %struct.ipv6_key_t, align 16
- call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @llvm.lifetime.start.p0(i64 48, ptr nonnull %ipv6_key)
call void @llvm.memset.p0.i64(ptr nonnull align 16 dereferenceable(48) %ipv6_key, i8 0, i64 48, i1 false)
store i32 %pid, ptr %ipv6_key, align 16, !tbaa !2
- call void @test1(ptr nonnull %ipv6_key) #4
- call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key) #4
+ call void @test1(ptr nonnull %ipv6_key)
+ call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %ipv6_key)
ret i32 0
}
@@ -35,21 +35,15 @@ entry:
; CHECK: *(u32 *)(r10 - 48) = r{{[0-9]+}}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
; Function Attrs: argmemonly nounwind willreturn writeonly
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare dso_local void @test1(ptr) local_unnamed_addr #3
+declare dso_local void @test1(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { argmemonly nounwind willreturn writeonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_trunc_free.ll b/llvm/test/CodeGen/BPF/is_trunc_free.ll
index fe00731..6bb8568 100644
--- a/llvm/test/CodeGen/BPF/is_trunc_free.ll
+++ b/llvm/test/CodeGen/BPF/is_trunc_free.ll
@@ -29,7 +29,7 @@
%struct.env_t = type { i32, i32 }
; Function Attrs: nounwind
-define dso_local i32 @test(ptr %skb) local_unnamed_addr #0 {
+define dso_local i32 @test(ptr %skb) local_unnamed_addr {
entry:
%data_end1 = getelementptr inbounds %struct.env_t, ptr %skb, i64 0, i32 1
%0 = load i32, ptr %data_end1, align 4, !tbaa !2
@@ -49,7 +49,7 @@ if.end10: ; preds = %entry
%sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
%4 = trunc i64 %sub.ptr.lhs.cast to i32
%conv13 = sub i32 %4, %2
- %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13) #2
+ %call = tail call i32 @work(ptr nonnull %skb, i32 %conv13)
br label %cleanup
cleanup: ; preds = %entry, %if.end10
@@ -59,11 +59,7 @@ cleanup: ; preds = %entry, %if.end10
; CHECK: w{{[0-9]+}} = *(u32 *)(r{{[0-9]+}} + 0)
-declare dso_local i32 @work(ptr, i32) local_unnamed_addr #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare dso_local i32 @work(ptr, i32) local_unnamed_addr
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/is_zext_free.ll b/llvm/test/CodeGen/BPF/is_zext_free.ll
index 4b81a90..3b794a9 100644
--- a/llvm/test/CodeGen/BPF/is_zext_free.ll
+++ b/llvm/test/CodeGen/BPF/is_zext_free.ll
@@ -7,7 +7,7 @@
; clang -target bpf -O2 -emit-llvm -S test.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr #0 {
+define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr {
entry:
%and = and i64 %y, %x
%conv = trunc i64 %and to i32
@@ -17,8 +17,6 @@ entry:
; CHECK: r[[REG1:[0-9]+]] = r{{[0-9]+}}
; CHECK: w[[REG1]] &= w{{[0-9]+}}
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
index fb1043c..8158a1b 100644
--- a/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
+++ b/llvm/test/CodeGen/BPF/objdump_two_funcs.ll
@@ -14,7 +14,7 @@
; clang -target bpf -S -gdwarf-5 -gembed-source -emit-llvm -g -O2 bug.c
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func1(i32 %a) local_unnamed_addr #0 section "s1" !dbg !7 {
+define dso_local i32 @func1(i32 %a) local_unnamed_addr section "s1" !dbg !7 {
entry:
; CHECK: <func1>:
call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !13
@@ -24,7 +24,7 @@ entry:
}
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @func2(i32 %a) local_unnamed_addr #0 section "s2" !dbg !16 {
+define dso_local i32 @func2(i32 %a) local_unnamed_addr section "s2" !dbg !16 {
entry:
; CHECK: <func2>:
call void @llvm.dbg.value(metadata i32 %a, metadata !18, metadata !DIExpression()), !dbg !19
@@ -35,10 +35,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/optnone-1.ll b/llvm/test/CodeGen/BPF/optnone-1.ll
index 68046bf..f45c85b 100644
--- a/llvm/test/CodeGen/BPF/optnone-1.ll
+++ b/llvm/test/CodeGen/BPF/optnone-1.ll
@@ -5,7 +5,7 @@
; clang -target bpf -g -S -emit-llvm test.c
; Function Attrs: noinline nounwind optnone
-define dso_local i32 @test(i32 %a, i32 %b) #0 !dbg !7 {
+define dso_local i32 @test(i32 %a, i32 %b) !dbg !7 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
@@ -22,10 +22,7 @@ entry:
; CHECK-LABEL: test
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable}
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf-2.ll b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
index 7398257..430abc7 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf-2.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf-2.ll
@@ -14,7 +14,7 @@
@s = internal global i32 0, align 4, !dbg !6
; Function Attrs: norecurse nounwind
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
+define dso_local i32 @test() local_unnamed_addr !dbg !14 {
%1 = load i32, ptr @g, align 4, !dbg !17, !tbaa !18
%2 = load volatile i32, ptr @s, align 4, !dbg !22, !tbaa !18
%3 = add nsw i32 %2, %1, !dbg !23
@@ -27,8 +27,6 @@ define dso_local i32 @test() local_unnamed_addr #0 !dbg !14 {
; CHECK-RELOC: R_BPF_64_NODYLD32 g
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!10, !11, !12}
!llvm.ident = !{!13}
diff --git a/llvm/test/CodeGen/BPF/reloc-btf.ll b/llvm/test/CodeGen/BPF/reloc-btf.ll
index b9f6e3a..875bfa1 100644
--- a/llvm/test/CodeGen/BPF/reloc-btf.ll
+++ b/llvm/test/CodeGen/BPF/reloc-btf.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=bpfel -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
; Function Attrs: norecurse nounwind readnone
-define dso_local i32 @test() local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test() local_unnamed_addr !dbg !7 {
entry:
ret i32 0, !dbg !11
}
@@ -13,8 +13,6 @@ entry:
; CHECK-RELOC: RELOCATION RECORDS FOR [.BTF.ext]:
; CHECK-RELOC: R_BPF_64_NODYLD32 .text
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/CodeGen/BPF/simplifycfg.ll b/llvm/test/CodeGen/BPF/simplifycfg.ll
index fcd2321..d53b51a 100644
--- a/llvm/test/CodeGen/BPF/simplifycfg.ll
+++ b/llvm/test/CodeGen/BPF/simplifycfg.ll
@@ -38,15 +38,15 @@ target triple = "bpf"
%struct.FrameData = type { ptr }
; Function Attrs: nounwind
-define dso_local i32 @test() #0 {
+define dso_local i32 @test() {
entry:
%frame_ptr = alloca ptr, align 8
%frame = alloca %struct.FrameData, align 8
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr) #3
- call void @llvm.lifetime.start.p0(i64 8, ptr %frame) #3
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame_ptr)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %frame)
call void @get_frame_ptr(ptr %frame_ptr)
- call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.start.p0(i64 4, ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond
@@ -61,7 +61,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
+ call void @llvm.lifetime.end.p0(i64 4, ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -93,25 +93,20 @@ for.end: ; preds = %for.cond.cleanup
%5 = load ptr, ptr %frame_ptr, align 8, !tbaa !6
%cmp2 = icmp eq ptr %5, null
%conv = zext i1 %cmp2 to i32
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame) #3
- call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr) #3
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %frame_ptr)
ret i32 %conv
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare dso_local void @get_frame_ptr(ptr) #2
+declare dso_local void @get_frame_ptr(ptr)
-declare dso_local i32 @get_data(ptr, ptr) #2
+declare dso_local i32 @get_data(ptr, ptr)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/BPF/warn-stack.ll b/llvm/test/CodeGen/BPF/warn-stack.ll
index 58a6e4c..5e62a91 100644
--- a/llvm/test/CodeGen/BPF/warn-stack.ll
+++ b/llvm/test/CodeGen/BPF/warn-stack.ll
@@ -1,43 +1,37 @@
; RUN: not llc -mtriple=bpfel < %s 2>&1 >/dev/null | FileCheck %s
;; CHECK-NOT: nowarn
-define void @nowarn() local_unnamed_addr #0 !dbg !6 {
+define void @nowarn() local_unnamed_addr !dbg !6 {
%1 = alloca [504 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1) #4, !dbg !15
+ call void @llvm.lifetime.start.p0(i64 504, ptr nonnull %1), !dbg !15
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !10, metadata !16), !dbg !17
- call void @doit(ptr nonnull %1) #4, !dbg !18
- call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1) #4, !dbg !19
+ call void @doit(ptr nonnull %1), !dbg !18
+ call void @llvm.lifetime.end.p0(i64 504, ptr nonnull %1), !dbg !19
ret void, !dbg !19
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @doit(ptr) local_unnamed_addr #3
+declare void @doit(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
; CHECK: error: warn_stack.c
; CHECK: BPF stack limit
-define void @warn() local_unnamed_addr #0 !dbg !20 {
+define void @warn() local_unnamed_addr !dbg !20 {
%1 = alloca [512 x i8], align 1
- call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1) #4, !dbg !26
+ call void @llvm.lifetime.start.p0(i64 512, ptr nonnull %1), !dbg !26
tail call void @llvm.dbg.declare(metadata ptr %1, metadata !22, metadata !16), !dbg !27
- call void @doit(ptr nonnull %1) #4, !dbg !28
- call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1) #4, !dbg !29
+ call void @doit(ptr nonnull %1), !dbg !28
+ call void @llvm.lifetime.end.p0(i64 512, ptr nonnull %1), !dbg !29
ret void, !dbg !29
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!5}
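; warn-stack.ll pivots on the fixed BPF frame size: the kernel caps a BPF
; program's stack at 512 bytes, so the 504-byte buffer in @nowarn still fits
; while the 512-byte buffer in @warn cannot, and llc emits the "BPF stack
; limit" diagnostic the CHECK lines expect. A minimal reproducer sketch for
; the failing side (assuming the same bpfel target as the test):
define void @overflow() {
entry:
  %buf = alloca [512 x i8], align 1   ; consumes the entire 512-byte frame
  call void @use(ptr %buf)
  ret void
}
declare void @use(ptr)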
diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll
index a3ec323..8d232ffb 100644
--- a/llvm/test/CodeGen/BPF/xadd.ll
+++ b/llvm/test/CodeGen/BPF/xadd.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "bpf"
; Function Attrs: nounwind
-define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 {
+define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr !dbg !7 {
entry:
call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15
%0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16
@@ -28,10 +28,7 @@ entry:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
new file mode 100644
index 0000000..6f1bbd0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
@@ -0,0 +1,13 @@
+; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s
+; Check that we correctly ignore cbuffers that were nulled out by optimizations.
+
+%__cblayout_CB = type <{ float }>
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) poison
+@x = external local_unnamed_addr addrspace(2) global float, align 4
+
+; CHECK-NOT: !hlsl.cbs =
+!hlsl.cbs = !{!0, !1, !2}
+
+!0 = !{ptr @CB.cb, ptr addrspace(2) @x}
+!1 = !{ptr @CB.cb, null}
+!2 = !{null, null}
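; The new unused.ll test leans on a general IR property: when an optimization
; deletes a global that module-level metadata still references, the operand is
; replaced with null rather than left dangling, so any pass walking !hlsl.cbs
; has to tolerate null entries. A minimal sketch of the surviving shape
; (@live.cb is a hypothetical name):
@live.cb = global i32 0
!hlsl.cbs = !{!3, !4}
!3 = !{ptr @live.cb, null}   ; the bound resource global was optimized away
!4 = !{null, null}           ; the whole cbuffer entry was optimized away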
diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
index 4f13f47..56798c8 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
@@ -28,6 +28,11 @@ define void @test() {
@llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str)
; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S }
+ ; StructuredBuffer<float[3][2]>
+ %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null)
+ ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] }
+
; ByteAddressBuffer
%byteaddr = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null)
@@ -40,12 +45,14 @@ define void @test() {
; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>"
; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>"
; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>"
+; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>"
; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer
; CHECK: !{i32 0, ptr @[[T0]], !"A"
; CHECK: !{i32 1, ptr @[[T1]], !""
; CHECK: !{i32 2, ptr @[[T2]], !""
; CHECK: !{i32 3, ptr @[[S0]], !"SB"
-; CHECK: !{i32 4, ptr @[[B0]], !""
+; CHECK: !{i32 4, ptr @[[S1]], !""
+; CHECK: !{i32 5, ptr @[[B0]], !""
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
new file mode 100644
index 0000000..4385da3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
@@ -0,0 +1,63 @@
+; Make sure we do not assert for the cases we do not handle.
+; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s
+
+; Mainly make sure we do not core dump.
+; CHECK-NOT: scatter
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite)
+define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 {
+entry:
+ %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0
+ %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0
+ %cmp.not15 = icmp ugt i32 8, %T
+ br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp3.not8 = icmp ugt i32 8, %W
+ %conv.ripple.LS.instance = trunc i32 %W to i8
+ %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0
+ %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer
+ br label %for.cond1.preheader
+
+for.cond.loopexit: ; preds = %for.body5, %for.cond1.preheader
+ %add = add i32 %add17, 8
+ %cmp.not = icmp ugt i32 %add, %T
+ br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit
+ %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ]
+ %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ]
+ br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph
+
+for.body5.lr.ph: ; preds = %for.cond1.preheader
+ %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016
+ br label %for.body5
+
+for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
+ ret void
+
+for.body5: ; preds = %for.body5.lr.ph, %for.body5
+ %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ]
+ %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ]
+ %gep = getelementptr i8, ptr %gep14, i32 %w.09
+ %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true))
+ %add2 = add i32 %add210, 8
+ %cmp3.not = icmp ugt i32 %add2, %W
+ br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000..83fd63e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %0 = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+ %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i32> poison)
+ store <32 x i32> %1, ptr %destination, align 4
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %0 = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %1, ptr %destination, align 2
+ ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+ %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %0 = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+ %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+ %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <128 x i8> poison)
+ store <128 x i8> %1, ptr %destination, align 1
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %0, i32 immarg %1, <128 x i1> %2, <128 x i8> %3) #1
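; The gather tests above spell out their all-true masks one lane at a time;
; the equivalent constant can also be written with the splat shorthand that
; ripple_scalarize_scatter.ll already uses, which keeps wide-vector tests
; readable. A minimal equivalent sketch:
define <64 x i16> @gather_all_lanes(<64 x ptr> %addrs) {
entry:
  %v = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %addrs, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
  ret <64 x i16> %v
}
declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>)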
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000..1bd79d7
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+ %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+ %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %div16 = lshr i32 %nS, 6
+ %cmp6.not = icmp ult i32 %nS, 64
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+ %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+ %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+ %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+ %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+ %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i16> poison)
+ store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+ %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+ %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+ %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+ %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+ %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+ %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+ %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+ %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+ %cmp = icmp ult i32 %add9, %div16
+ br i1 %cmp, label %for.body, label %for.end
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+ %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+ %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+ %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+ %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+ %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+ %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+ %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000..85d2999
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+ %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+ %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+ %cst_ptr_to_i32 = ptrtoint ptr %destination to i32
+ tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+ %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+ %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+ %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+ %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+ %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+ %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+ %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+ tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000..461fd79
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This produced a masked gather that we are not yet handling
+; REQUIRES: asserts
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Original C++
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -mv68
+;typedef struct poptContext_s * poptContext;
+;typedef struct { unsigned int bits[1]; } pbm_set;
+;struct poptContext_s { pbm_set * arg_strip; };
+;
+;int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+; int numargs = argc;
+; for (int i = 1; i < argc; i++) {
+; if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+; numargs--;
+; }
+; return numargs;
+;}
+
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr #0 {
+entry:
+ %cmp8 = icmp sgt i32 %argc, 1
+ br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load ptr, ptr %con, align 4
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %numargs.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+ %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+ %div7 = lshr i32 %i.010, 5
+ %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+ %1 = load i32, ptr %arrayidx, align 4
+ %rem = and i32 %i.010, 31
+ %shl = shl nuw i32 1, %rem
+ %and = and i32 %1, %shl
+ %cmp1.not = icmp ne i32 %and, 0
+ %dec = sext i1 %cmp1.not to i32
+ %spec.select = add nsw i32 %numargs.09, %dec
+ %inc = add nuw nsw i32 %i.010, 1
+ %exitcond.not = icmp eq i32 %inc, %argc
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000..5700380
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+@VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+@region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+ %vgather.addr = alloca ptr, align 4
+ %offsets.addr = alloca <32 x i32>, align 128
+ store ptr %vgather, ptr %vgather.addr, align 4
+ store <32 x i32> %offsets, ptr %offsets.addr, align 128
+ %0 = load ptr, ptr %vgather.addr, align 4
+ %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+ %2 = load i32, ptr @region_len, align 4
+ %3 = load <32 x i32>, ptr %offsets.addr, align 128
+ call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+ ret void
+}
+
+declare void @llvm.hexagon.V6.vgathermh.128B(ptr, i32, i32, <32 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index 245f764..7149cdb 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -32,9 +32,7 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
-; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
@@ -55,9 +53,7 @@ define <16 x i16> @shuffle_v16i16_same_lane(<16 x i16> %a) {
define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -93,9 +89,7 @@ define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) {
define <8 x float> @shuffle_v8f32(<8 x float> %a) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 226
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir
new file mode 100644
index 0000000..6c6f9b6
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask-invalid-scalar.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -mtriple=aarch64-- -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s
+
+---
+name: test_shuffle_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $d0
+ %0:_(<2 x s32>) = COPY $d0
+ %2:_(<2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[@LINE+1]]:67: shufflemask should have > 1 element
+ %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(0)
+ $w0 = COPY %1
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
index c529d63..4cc3bc6 100644
--- a/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-shufflemask.mir
@@ -122,54 +122,3 @@ body: |
RET_ReallyLR implicit $d0
...
-
-# CHECK-LABEL: name: test_shuffle_0
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(0)
----
-name: test_shuffle_0
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(0)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
-
-# CHECK-LABEL: name: test_shuffle_1
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(1)
----
-name: test_shuffle_1
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(1)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
-
-# CHECK-LABEL: name: test_shuffle_undef
-# CHECK: G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(undef)
----
-name: test_shuffle_undef
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $d0
-
- %0:_(<2 x s32>) = COPY $d0
- %2:_(<2 x s32>) = G_IMPLICIT_DEF
- %1:_(s32) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(undef)
- $w0 = COPY %1
- RET_ReallyLR implicit $w0
-
-...
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
index 2894fff..da0d13d 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities" : {
+ "Opcodes" : {
"ABS_Fp":[1, 2],
"ADC":[3, 4],
"ADD":[5, 6],
@@ -7,5 +7,21 @@
"ADDPDrr":[9, 10],
"ADDPSrr":[11, 12],
"ADDSDrm":[13, 14]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1],
+ "MBB": [0.2, 0.2],
+ "FrameIndex": [0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5],
+ "GR64": [0.6, 0.6],
+ "XMM": [0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8],
+ "GR64": [0.9, 0.9],
+ "XMM": [1.0, 1.0]
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
index 5de715b..f4b14a4 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"KILL": [0.1, 0.2, 0.3],
"MOV": [0.4, 0.5, 0.6],
"LEA": [0.7, 0.8, 0.9],
@@ -18,5 +18,21 @@
"POP": [4.6, 4.7, 4.8],
"NOP": [4.9, 5.0, 5.1],
"COPY": [5.2, 5.3, 5.4]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1, 0.1],
+ "MBB": [0.2, 0.2, 0.2],
+ "FrameIndex": [0.3, 0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5, 0.5],
+ "GR64": [0.6, 0.6, 0.6],
+ "XMM": [0.7, 0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8, 0.8],
+ "GR64": [0.9, 0.9, 0.9],
+ "XMM": [1.0, 1.0, 1.0]
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
index bf04163..6274fb7 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
@@ -1,7 +1,16 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [1.0, 2.0, 3.0],
"SUB": [1.5],
"MUL": [2.0, 3.0]
+ },
+ "CommonOperands": {
+ "Immediate": [1.0]
+ },
+ "PhysicalRegisters": {
+ "GR32": [1.0, 2.0]
+ },
+ "VirtualRegisters": {
+ "GR32": [1.0, 2.0, 3.0]
}
}
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
index 63e8ccbd..7bfdf3b 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [],
"SUB": [],
"MUL": [],
@@ -8,5 +8,14 @@
"JMP": [],
"CALL": [],
"RET": []
+ },
+ "CommonOperands": {
+ "Immediate": []
+ },
+ "PhysicalRegisters": {
+ "GR32": []
+ },
+ "VirtualRegisters": {
+ "GR32": []
}
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index 6327cff..d3c0da9 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index 4409e6d..c6e5508 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir
index 5734a23..f2572f5 100644
--- a/llvm/test/CodeGen/MIR2Vec/if-else.mir
+++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir
@@ -135,10 +135,10 @@ body: |
# CHECK: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: abc:entry:
-# CHECK-NEXT: [ 16.50 17.10 17.70 ]
+# CHECK-NEXT: [ 23.60 24.20 24.80 ]
# CHECK-NEXT: Machine basic block: abc:if.then:
-# CHECK-NEXT: [ 4.50 4.80 5.10 ]
+# CHECK-NEXT: [ 7.30 7.60 7.90 ]
# CHECK-NEXT: Machine basic block: abc:if.else:
-# CHECK-NEXT: [ 0.80 1.00 1.20 ]
+# CHECK-NEXT: [ 3.40 3.60 3.80 ]
# CHECK-NEXT: Machine basic block: abc:return:
-# CHECK-NEXT: [ 6.60 6.90 7.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 8.80 9.10 9.40 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
index 338cb63..0fdcc81 100644
--- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
+++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
@@ -48,29 +48,29 @@ body: |
RET 0
# CHECK: MIR2Vec embeddings for machine function add_function:
-# CHECK: Function vector: [ 19.20 19.80 20.40 ]
+# CHECK: Function vector: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: add_function:entry:
-# CHECK-NEXT: [ 19.20 19.80 20.40 ]
+# CHECK-NEXT: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: RET 0, $eax
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK: MIR2Vec embeddings for machine function simple_function:
-# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: Function vector: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: simple_function:entry:
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: RET 0
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
\ No newline at end of file
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
index c6554bc..13e908e 100644
--- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
+++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
@@ -10,6 +10,6 @@ define dso_local void @test() {
}
; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
-; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero
-; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file
-; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension
+; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero
+; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file
+; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension
diff --git a/llvm/test/CodeGen/MSP430/libcalls.ll b/llvm/test/CodeGen/MSP430/libcalls.ll
index 5d3755c..d1bafea 100644
--- a/llvm/test/CodeGen/MSP430/libcalls.ll
+++ b/llvm/test/CodeGen/MSP430/libcalls.ll
@@ -639,4 +639,18 @@ entry:
ret i32 %shr
}
+define i64 @test__mspabi_divull(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: test__mspabi_divull:
+; CHECK: call #__mspabi_divull
+ %result = udiv i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @test__mspabi_remull(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: test__mspabi_remull:
+; CHECK: call #__mspabi_remull
+ %result = urem i64 %a, %b
+ ret i64 %result
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll b/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll
new file mode 100644
index 0000000..bc79c6f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/store-fp-zero-to-x0.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=riscv32 -mattr=+f,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV32F
+; RUN: llc -global-isel -mtriple=riscv32 -mattr=+d,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV32D
+; RUN: llc -global-isel -mtriple=riscv64 -mattr=+f,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV64F
+; RUN: llc -global-isel -mtriple=riscv64 -mattr=+d,+zfh < %s \
+; RUN: | FileCheck %s --check-prefix=RV64D
+
+define void @zero_f16(ptr %i) {
+; RV32F-LABEL: zero_f16:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sh zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f16:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sh zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f16:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sh zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f16:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sh zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store half 0.0, ptr %i, align 4
+ ret void
+}
+
+define void @zero_bf16(ptr %i) {
+; RV32F-LABEL: zero_bf16:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sh zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_bf16:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sh zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_bf16:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sh zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_bf16:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sh zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store bfloat 0.0, ptr %i, align 4
+ ret void
+}
+
+define void @zero_f32(ptr %i) {
+; RV32F-LABEL: zero_f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store float 0.0, ptr %i, align 4
+ ret void
+}
+
+
+define void @zero_f64(ptr %i) {
+; RV32F-LABEL: zero_f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI3_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI3_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store double 0.0, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v1f32(ptr %i) {
+; RV32F-LABEL: zero_v1f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v1f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v1f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v1f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store <1 x float> <float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v2f32(ptr %i) {
+; RV32F-LABEL: zero_v2f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: sw zero, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v2f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: sw zero, 4(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v2f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: sw zero, 4(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v2f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: sw zero, 4(a0)
+; RV64D-NEXT: ret
+entry:
+ store <2 x float> <float 0.0, float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v4f32(ptr %i) {
+; RV32F-LABEL: zero_v4f32:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: sw zero, 0(a0)
+; RV32F-NEXT: sw zero, 4(a0)
+; RV32F-NEXT: sw zero, 8(a0)
+; RV32F-NEXT: sw zero, 12(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v4f32:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: sw zero, 0(a0)
+; RV32D-NEXT: sw zero, 4(a0)
+; RV32D-NEXT: sw zero, 8(a0)
+; RV32D-NEXT: sw zero, 12(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v4f32:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sw zero, 0(a0)
+; RV64F-NEXT: sw zero, 4(a0)
+; RV64F-NEXT: sw zero, 8(a0)
+; RV64F-NEXT: sw zero, 12(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v4f32:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sw zero, 0(a0)
+; RV64D-NEXT: sw zero, 4(a0)
+; RV64D-NEXT: sw zero, 8(a0)
+; RV64D-NEXT: sw zero, 12(a0)
+; RV64D-NEXT: ret
+entry:
+ store <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v1f64(ptr %i) {
+; RV32F-LABEL: zero_v1f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI7_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI7_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v1f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v1f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v1f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: ret
+entry:
+ store <1 x double> <double 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v2f64(ptr %i) {
+; RV32F-LABEL: zero_v2f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI8_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI8_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: sw a2, 8(a0)
+; RV32F-NEXT: sw a1, 12(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v2f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: fsd fa5, 8(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v2f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: sd zero, 8(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v2f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: sd zero, 8(a0)
+; RV64D-NEXT: ret
+entry:
+ store <2 x double> <double 0.0, double 0.0>, ptr %i, align 8
+ ret void
+}
+
+define void @zero_v4f64(ptr %i) {
+; RV32F-LABEL: zero_v4f64:
+; RV32F: # %bb.0: # %entry
+; RV32F-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32F-NEXT: addi a1, a1, %lo(.LCPI9_0)
+; RV32F-NEXT: lw a2, 0(a1)
+; RV32F-NEXT: lw a1, 4(a1)
+; RV32F-NEXT: sw a2, 0(a0)
+; RV32F-NEXT: sw a1, 4(a0)
+; RV32F-NEXT: sw a2, 8(a0)
+; RV32F-NEXT: sw a1, 12(a0)
+; RV32F-NEXT: sw a2, 16(a0)
+; RV32F-NEXT: sw a1, 20(a0)
+; RV32F-NEXT: sw a2, 24(a0)
+; RV32F-NEXT: sw a1, 28(a0)
+; RV32F-NEXT: ret
+;
+; RV32D-LABEL: zero_v4f64:
+; RV32D: # %bb.0: # %entry
+; RV32D-NEXT: fcvt.d.w fa5, zero
+; RV32D-NEXT: fsd fa5, 0(a0)
+; RV32D-NEXT: fsd fa5, 8(a0)
+; RV32D-NEXT: fsd fa5, 16(a0)
+; RV32D-NEXT: fsd fa5, 24(a0)
+; RV32D-NEXT: ret
+;
+; RV64F-LABEL: zero_v4f64:
+; RV64F: # %bb.0: # %entry
+; RV64F-NEXT: sd zero, 0(a0)
+; RV64F-NEXT: sd zero, 8(a0)
+; RV64F-NEXT: sd zero, 16(a0)
+; RV64F-NEXT: sd zero, 24(a0)
+; RV64F-NEXT: ret
+;
+; RV64D-LABEL: zero_v4f64:
+; RV64D: # %bb.0: # %entry
+; RV64D-NEXT: sd zero, 0(a0)
+; RV64D-NEXT: sd zero, 8(a0)
+; RV64D-NEXT: sd zero, 16(a0)
+; RV64D-NEXT: sd zero, 24(a0)
+; RV64D-NEXT: ret
+entry:
+ store <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, ptr %i, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
new file mode 100644
index 0000000..4eee880a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -0,0 +1,709 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
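+; Basic codegen tests for the experimental P extension on RV32: counting
+; leading/trailing zeros, sign extension, min/max, abs, bswap, and shift
+; patterns.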
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @ctlz_i32(i32 %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bnez a1, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: addi a0, a0, 32
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: clz a0, a1
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define i32 @cttz_i32(i32 %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB2_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: or a2, a0, a1
+; CHECK-NEXT: beqz a2, .LBB3_3
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: bnez a0, .LBB3_4
+; CHECK-NEXT: # %bb.2: # %cond.false
+; CHECK-NEXT: addi a0, a1, -1
+; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: j .LBB3_5
+; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: .LBB3_5: # %cond.false
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define i32 @sextb_i32(i32 %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define i32 @sexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: srai a1, a0, 31
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define i32 @min_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB9_3
+; CHECK-NEXT: j .LBB9_4
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB9_4
+; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @max_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: slt a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB11_3
+; CHECK-NEXT: j .LBB11_4
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB11_4
+; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @minu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: beqz a4, .LBB13_3
+; CHECK-NEXT: j .LBB13_4
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: bnez a4, .LBB13_4
+; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+; Since we do not directly match i64 code patterns on RV32, some i64 patterns
+; do not yet have matching bit-manipulation instructions on RV32. This test is
+; included in case future expansions of the Bitmanip extensions introduce
+; instructions suitable for this pattern.
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a3, a1
+; CHECK-NEXT: beqz a4, .LBB15_3
+; CHECK-NEXT: j .LBB15_4
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: bnez a4, .LBB15_4
+; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bgez a1, .LBB17_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: snez a2, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: sub a1, a1, a2
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+define i32 @zexth_i32(i32 %a) nounwind {
+; CHECK-LABEL: zexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: ret
+ %and = and i32 %a, 65535
+ ret i32 %and
+}
+
+define i64 @zexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: zexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 16
+; CHECK-NEXT: srli a0, a0, 16
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: ret
+ %and = and i64 %a, 65535
+ ret i64 %and
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define i32 @bswap_i32(i32 %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a2, a1
+; CHECK-NEXT: rev8 a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i32 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 31
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
+
+define i32 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 25
+; CHECK-NEXT: srai a0, a0, 30
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i32
+ ret i32 %3
+}
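+
+; The following tests check that "subtract y from x only if x >= y" patterns
+; (equivalently, umin(x, x - y) for unsigned x and y) are lowered to a sub
+; followed by minu.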
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB27_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB27_3
+; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB27_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a3, 4(a1)
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB28_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu t1, a5, t0
+; CHECK-NEXT: j .LBB28_3
+; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: sltu t1, a4, a7
+; CHECK-NEXT: .LBB28_3:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB28_5
+; CHECK-NEXT: # %bb.4:
+; CHECK-NEXT: sltu t2, a3, a6
+; CHECK-NEXT: j .LBB28_6
+; CHECK-NEXT: .LBB28_5:
+; CHECK-NEXT: sltu t2, a1, a2
+; CHECK-NEXT: .LBB28_6:
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
+; CHECK-NEXT: or t3, t4, t3
+; CHECK-NEXT: beqz t3, .LBB28_8
+; CHECK-NEXT: # %bb.7:
+; CHECK-NEXT: mv t2, t1
+; CHECK-NEXT: .LBB28_8:
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
+; CHECK-NEXT: sltu a2, a1, t0
+; CHECK-NEXT: and a7, t3, a7
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: beq a3, t1, .LBB28_10
+; CHECK-NEXT: # %bb.9:
+; CHECK-NEXT: sltu a6, a3, t1
+; CHECK-NEXT: .LBB28_10:
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
+; CHECK-NEXT: sub a1, a1, t0
+; CHECK-NEXT: sltu a7, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
+; CHECK-NEXT: sub a3, a3, a2
+; CHECK-NEXT: sub a2, a5, a7
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: sw a3, 4(a0)
+; CHECK-NEXT: sw a4, 8(a0)
+; CHECK-NEXT: sw a2, 12(a0)
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sll a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: minu a2, a0, a2
+; CHECK-NEXT: bltu a0, a1, .LBB30_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB30_2:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 %x, i32 %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 16
+; CHECK-NEXT: srli a1, a1, 16
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: beq a1, a2, .LBB35_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltiu a2, a1, 2
+; CHECK-NEXT: xori a2, a2, 1
+; CHECK-NEXT: j .LBB35_3
+; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: lui a2, 172127
+; CHECK-NEXT: addi a2, a2, 511
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: andi a3, a2, -2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: lui a3, 876449
+; CHECK-NEXT: addi a3, a3, -512
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
new file mode 100644
index 0000000..cb07f94
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
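+; Basic codegen tests for the experimental P extension on RV64: counting
+; leading/trailing zeros, sign extension, min/max, abs, and bswap.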
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define signext i32 @ctlz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ctlz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @log2_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 31
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %2 = sub i32 31, %1
+ ret i32 %2
+}
+
+define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: log2_ceil_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = sub i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ %3 = sub i32 32, %2
+ ret i32 %3
+}
+
+define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findLastSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clzw a1, a0
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: xori a1, a1, 31
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+ %2 = xor i32 31, %1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 -1, i32 %2
+ ret i32 %4
+}
+
+define i32 @ctlz_lshr_i32(i32 signext %a) {
+; CHECK-LABEL: ctlz_lshr_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srliw a0, a0, 1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: ret
+ %1 = lshr i32 %a, 1
+ %2 = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+ ret i32 %2
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i64 @ctlz_i64(i64 %a) nounwind {
+; CHECK-LABEL: ctlz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define signext i32 @cttz_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB6_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %1
+}
+
+define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: cttz_zero_undef_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clzw a0, a0
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %1
+}
+
+define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: findFirstSet_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: or a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = icmp eq i32 %a, 0
+ %3 = select i1 %2, i32 -1, i32 %1
+ ret i32 %3
+}
+
+define signext i32 @ffs_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: ffs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a2, a0
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: li a2, 33
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: clzw a1, a1
+; CHECK-NEXT: sub a2, a2, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ %2 = add i32 %1, 1
+ %3 = icmp eq i32 %a, 0
+ %4 = select i1 %3, i32 0, i32 %2
+ ret i32 %4
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i64 @cttz_i64(i64 %a) nounwind {
+; CHECK-LABEL: cttz_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beqz a0, .LBB10_2
+; CHECK-NEXT: # %bb.1: # %cond.false
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: not a0, a0
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: clz a0, a0
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %1
+}
+
+define signext i32 @sextb_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sextb_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 24
+ %shr = ashr exact i32 %shl, 24
+ ret i32 %shr
+}
+
+define i64 @sextb_i64(i64 %a) nounwind {
+; CHECK-LABEL: sextb_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 56
+ %shr = ashr exact i64 %shl, 56
+ ret i64 %shr
+}
+
+define signext i32 @sexth_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: sexth_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i32 %a, 16
+ %shr = ashr exact i32 %shl, 16
+ ret i32 %shr
+}
+
+define i64 @sexth_i64(i64 %a) nounwind {
+; CHECK-LABEL: sexth_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: ret
+ %shl = shl i64 %a, 48
+ %shr = ashr exact i64 %shl, 48
+ ret i64 %shr
+}
+
+define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: min_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @min_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: min_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: min a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp slt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: max_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @max_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: max_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: max a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: minu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @minu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: minu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
+; CHECK-LABEL: maxu_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %a, %b
+ %cond = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %cond
+}
+
+define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: maxu_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: maxu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %a, %b
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+define i32 @abs_i32(i32 %x) {
+; CHECK-LABEL: abs_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+define signext i32 @abs_i32_sext(i32 signext %x) {
+; CHECK-LABEL: abs_i32_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
+ ret i32 %abs
+}
+
+declare i64 @llvm.abs.i64(i64, i1 immarg)
+
+define i64 @abs_i64(i64 %x) {
+; CHECK-LABEL: abs_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: abs a0, a0
+; CHECK-NEXT: ret
+ %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
+ ret i64 %abs
+}
+
+declare i32 @llvm.bswap.i32(i32)
+
+define signext i32 @bswap_i32(i32 signext %a) nounwind {
+; CHECK-LABEL: bswap_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srai a0, a0, 32
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %1
+}
+
+; Similar to bswap_i32 but the result is not sign extended.
+define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
+; CHECK-LABEL: bswap_i32_nosext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %1 = tail call i32 @llvm.bswap.i32(i32 %a)
+ store i32 %1, ptr %x
+ ret void
+}
+
+declare i64 @llvm.bswap.i64(i64)
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK-LABEL: bswap_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rev8 a0, a0
+; CHECK-NEXT: ret
+ %1 = call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %1
+}
+
+define i64 @srai_slli(i16 signext %0) {
+; CHECK-LABEL: srai_slli:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 63
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 15
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define i64 @srai_slli2(i16 signext %0) {
+; CHECK-LABEL: srai_slli2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 57
+; CHECK-NEXT: srai a0, a0, 62
+; CHECK-NEXT: ret
+ %2 = shl i16 %0, 9
+ %sext = ashr i16 %2, 14
+ %3 = sext i16 %sext to i64
+ ret i64 %3
+}
+
+define signext i32 @func0000000000000001(i32 signext %0, i8 signext %1) #0 {
+; CHECK-LABEL: func0000000000000001:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: slli a1, a1, 59
+; CHECK-NEXT: srai a1, a1, 63
+; CHECK-NEXT: addw a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %2 = shl i8 %1, 3
+ %3 = ashr i8 %2, 7
+ %4 = sext i8 %3 to i32
+ %5 = add nsw i32 %4, %0
+ ret i32 %5
+}
+
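+; The following tests check that "subtract y from x only if x >= y" patterns
+; (equivalently, umin(x, x - y) for unsigned x and y) are lowered to a sub
+; followed by minu.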
+define i8 @sub_if_uge_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: sub_if_uge_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a2, a0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i8 %x, %y
+ %select = select i1 %cmp, i8 0, i8 %y
+ %sub = sub nuw i8 %x, %select
+ ret i8 %sub
+}
+
+define i16 @sub_if_uge_i16(i16 %x, i16 %y) {
+; CHECK-LABEL: sub_if_uge_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i16 %x, %y
+ %select = select i1 %cmp, i16 0, i16 %y
+ %sub = sub nuw i16 %x, %select
+ ret i16 %sub
+}
+
+define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: sub_if_uge_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i64 %x, %y
+ %select = select i1 %cmp, i64 0, i64 %y
+ %sub = sub nuw i64 %x, %select
+ ret i64 %sub
+}
+
+define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub_if_uge_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: beq a1, a3, .LBB36_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: j .LBB36_3
+; CHECK-NEXT: .LBB36_2:
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: .LBB36_3:
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: sub a1, a1, a4
+; CHECK-NEXT: sub a0, a0, a2
+; CHECK-NEXT: ret
+ %cmp = icmp ult i128 %x, %y
+ %select = select i1 %cmp, i128 0, i128 %y
+ %sub = sub nuw i128 %x, %select
+ ret i128 %sub
+}
+
+define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: sltu a2, a3, a2
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %shl = shl i32 %sub, %select
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: sext.w a3, a0
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: bltu a3, a2, .LBB38_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB38_2:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: sllw a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, %y
+ %select = select i1 %cmp, i32 0, i32 %y
+ %sub = sub nuw i32 %x, %select
+ %select2 = select i1 %cmp, i32 2, i32 4
+ %shl = shl i32 %sub, %select2
+ ret i32 %shl
+}
+
+define i32 @sub_if_uge_multiuse_cmp_store_i32(i32 signext %x, i32 signext %y, ptr %z) {
+; CHECK-LABEL: sub_if_uge_multiuse_cmp_store_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: subw a1, a0, a1
+; CHECK-NEXT: xori a3, a3, 1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: sw a3, 0(a2)
+; CHECK-NEXT: ret
+ %cmp = icmp uge i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %select = select i1 %cmp, i32 %y, i32 0
+ %sub = sub nuw i32 %x, %select
+ ret i32 %sub
+}
+
+define i8 @sub_if_uge_C_i8(i8 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -13
+; CHECK-NEXT: zext.b a1, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i8 %x, 12
+ %sub = add i8 %x, -13
+ %conv4 = select i1 %cmp, i8 %sub, i8 %x
+ ret i8 %conv4
+}
+
+define i16 @sub_if_uge_C_i16(i16 zeroext %x) {
+; CHECK-LABEL: sub_if_uge_C_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -251
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: srli a1, a1, 48
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i16 %x, 250
+ %sub = add i16 %x, -251
+ %conv4 = select i1 %cmp, i16 %sub, i16 %x
+ ret i16 %conv4
+}
+
+define i32 @sub_if_uge_C_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i64 @sub_if_uge_C_i64(i64 %x) {
+; CHECK-LABEL: sub_if_uge_C_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1046192
+; CHECK-NEXT: addi a1, a1, -761
+; CHECK-NEXT: slli a1, a1, 9
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i64 %x, 4999999999
+ %sub = add i64 %x, -5000000000
+ %cond = select i1 %cmp, i64 %sub, i64 %x
+ ret i64 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_cmp_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: lui a3, 1048560
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a3, a3, 15
+; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: addw a3, a0, a3
+; CHECK-NEXT: minu a0, a3, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %cmp = icmp ugt i32 %x, 65520
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, ptr %z, align 4
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
+; CHECK-LABEL: sub_if_uge_C_multiuse_sub_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 1048560
+; CHECK-NEXT: addi a2, a2, 15
+; CHECK-NEXT: addw a2, a0, a2
+; CHECK-NEXT: minu a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+ %sub = add i32 %x, -65521
+ store i32 %sub, ptr %z, align 4
+ %cmp = icmp ugt i32 %x, 65520
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ ret i32 %cond
+}
+
+define i32 @sub_if_uge_C_swapped_i32(i32 signext %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048560
+; CHECK-NEXT: addi a1, a1, 15
+; CHECK-NEXT: addw a1, a0, a1
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %cmp = icmp ult i32 %x, 65521
+ %sub = add i32 %x, -65521
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ ret i32 %cond
+}
+
+define i7 @sub_if_uge_C_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a0, a1
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ugt i7 %x, -18
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %add, i7 %x
+ ret i7 %s
+}
+
+define i7 @sub_if_uge_C_swapped_nsw_i7(i7 %a) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nsw_i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a0, a0, 51
+; CHECK-NEXT: andi a1, a0, 127
+; CHECK-NEXT: addi a0, a0, 17
+; CHECK-NEXT: andi a0, a0, 92
+; CHECK-NEXT: minu a0, a1, a0
+; CHECK-NEXT: ret
+ %x = or i7 %a, 51
+ %c = icmp ult i7 %x, -17
+ %add = add nsw i7 %x, 17
+ %s = select i1 %c, i7 %x, i7 %add
+ ret i7 %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
new file mode 100644
index 0000000..5c0c6c1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfbfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+xsfvfbfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
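+; Tests for the SiFive sf.vfexp.v instruction on scalable bf16 vectors
+; (XSfvfbfexp16e), in both unmasked and masked forms.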
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_v_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_v_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_v_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_v_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_v_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_v_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_sf_vfexp_mask_v_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
new file mode 100644
index 0000000..2d97f73
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp16e.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp16e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
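+; Tests for the SiFive sf.vfexp.v instruction on scalable f16 vectors
+; (XSfvfexp16e), in both unmasked and masked forms.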
+define <vscale x 1 x half> @intrinsic_sf_vfexp_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x half> %a
+}
+
+define <vscale x 1 x half> @intrinsic_sf_vfexp_mask_v_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x half> %a
+}
+
+define <vscale x 2 x half> @intrinsic_sf_vfexp_mask_v_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x half> %a
+}
+
+define <vscale x 4 x half> @intrinsic_sf_vfexp_mask_v_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x half> %a
+}
+
+define <vscale x 8 x half> @intrinsic_sf_vfexp_mask_v_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x half> %a
+}
+
+define <vscale x 16 x half> @intrinsic_sf_vfexp_mask_v_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x half> %a
+}
+
+define <vscale x 32 x half> @intrinsic_sf_vfexp_mask_v_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 32 x half> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
new file mode 100644
index 0000000..46dce14
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexp32e.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+xsfvfexp32e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
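+; Tests for the SiFive sf.vfexp.v instruction on scalable f32 vectors
+; (XSfvfexp32e), in both unmasked and masked forms.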
+define <vscale x 1 x float> @intrinsic_sf_vfexp_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexp.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_sf_vfexp_mask_v_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_sf_vfexp_mask_v_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_sf_vfexp_mask_v_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_sf_vfexp_mask_v_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_sf_vfexp_mask_v_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_sf_vfexp_mask_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexp.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %2, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
new file mode 100644
index 0000000..d3d10d2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64f,+zvfh,+xsfvfexpa \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_nxv1f32(<vscale x 1 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_nxv2f32(<vscale x 2 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_nxv4f32(<vscale x 4 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_nxv8f32(<vscale x 8 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_nxv16f32(<vscale x 16 x float> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x float> %f
+}
+
+define <vscale x 2 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x float> %f
+}
+
+define <vscale x 4 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x float> %f
+}
+
+define <vscale x 8 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x float> %f
+}
+
+define <vscale x 16 x float> @test_intrinsic_sf_vfexpa_v_mask_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x float> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_nxv1f16(<vscale x 1 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %0,
+ iXLen %1)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_nxv2f16(<vscale x 2 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16(
+ <vscale x 2 x half> poison,
+ <vscale x 2 x half> %0,
+ iXLen %1)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_nxv4f16(<vscale x 4 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16(
+ <vscale x 4 x half> poison,
+ <vscale x 4 x half> %0,
+ iXLen %1)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_nxv8f16(<vscale x 8 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16(
+ <vscale x 8 x half> poison,
+ <vscale x 8 x half> %0,
+ iXLen %1)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_nxv16f16(<vscale x 16 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16(
+ <vscale x 16 x half> poison,
+ <vscale x 16 x half> %0,
+ iXLen %1)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_nxv32f16(<vscale x 32 x half> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16(
+ <vscale x 32 x half> poison,
+ <vscale x 32 x half> %0,
+ iXLen %1)
+ ret <vscale x 32 x half> %f
+}
+
+define <vscale x 1 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16(
+ <vscale x 1 x half> %0,
+ <vscale x 1 x half> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x half> %f
+}
+
+define <vscale x 2 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16(
+ <vscale x 2 x half> %0,
+ <vscale x 2 x half> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x half> %f
+}
+
+define <vscale x 4 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16(
+ <vscale x 4 x half> %0,
+ <vscale x 4 x half> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x half> %f
+}
+
+define <vscale x 8 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16(
+ <vscale x 8 x half> %0,
+ <vscale x 8 x half> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x half> %f
+}
+
+define <vscale x 16 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv16f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16(
+ <vscale x 16 x half> %0,
+ <vscale x 16 x half> %1,
+ <vscale x 16 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 16 x half> %f
+}
+
+define <vscale x 32 x half> @test_intrinsic_sf_vfexpa_v_mask_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv32f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16(
+ <vscale x 32 x half> %0,
+ <vscale x 32 x half> %1,
+ <vscale x 32 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 32 x half> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
new file mode 100644
index 0000000..3de0e93
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfexpa64e.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfvfexpa64e \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_nxv1f64(<vscale x 1 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64(
+ <vscale x 1 x double> poison,
+ <vscale x 1 x double> %0,
+ iXLen %1)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_nxv2f64(<vscale x 2 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64(
+ <vscale x 2 x double> poison,
+ <vscale x 2 x double> %0,
+ iXLen %1)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_nxv4f64(<vscale x 4 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64(
+ <vscale x 4 x double> poison,
+ <vscale x 4 x double> %0,
+ iXLen %1)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_nxv8f64(<vscale x 8 x double> %0, iXLen %1) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: sf.vfexpa.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64(
+ <vscale x 8 x double> poison,
+ <vscale x 8 x double> %0,
+ iXLen %1)
+ ret <vscale x 8 x double> %f
+}
+
+define <vscale x 1 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64(
+ <vscale x 1 x double> %0,
+ <vscale x 1 x double> %1,
+ <vscale x 1 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 1 x double> %f
+}
+
+define <vscale x 2 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64(
+ <vscale x 2 x double> %0,
+ <vscale x 2 x double> %1,
+ <vscale x 2 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 2 x double> %f
+}
+
+define <vscale x 4 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64(
+ <vscale x 4 x double> %0,
+ <vscale x 4 x double> %1,
+ <vscale x 4 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 4 x double> %f
+}
+
+define <vscale x 8 x double> @test_intrinsic_sf_vfexpa_v_mask_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %m, iXLen %vl) {
+; CHECK-LABEL: test_intrinsic_sf_vfexpa_v_mask_nxv8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, mu
+; CHECK-NEXT: sf.vfexpa.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %f = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64(
+ <vscale x 8 x double> %0,
+ <vscale x 8 x double> %1,
+ <vscale x 8 x i1> %m,
+ iXLen %vl,
+ iXLen 0)
+ ret <vscale x 8 x double> %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
index 061b2b0..abd00b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
@@ -11,33 +11,80 @@
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfh,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 1 x bfloat> %va, %vb
ret <vscale x 1 x bfloat> %vc
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%vc = fadd <vscale x 1 x bfloat> %va, %splat
@@ -45,31 +92,75 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloa
}
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 2 x bfloat> %va, %vb
ret <vscale x 2 x bfloat> %vc
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vf v9, v9, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vf v9, v9, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v9, v9, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v9, v9, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%vc = fadd <vscale x 2 x bfloat> %va, %splat
@@ -77,31 +168,75 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloa
}
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 4 x bfloat> %va, %vb
ret <vscale x 4 x bfloat> %vc
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vf v10, v10, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vf v10, v10, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v10, v10, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v10, v10, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%vc = fadd <vscale x 4 x bfloat> %va, %splat
@@ -109,31 +244,75 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 8 x bfloat> %va, %vb
ret <vscale x 8 x bfloat> %vc
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x bfloat> %va, %splat
@@ -141,16 +320,38 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_fv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vf v12, v12, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_fv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vf v12, v12, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_fv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v12, v12, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v12, v12, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x bfloat> %splat, %va
@@ -158,31 +359,75 @@ define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 16 x bfloat> %va, %vb
ret <vscale x 16 x bfloat> %vc
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vf v16, v16, fa5
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vf v16, v16, fa5
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vf v16, v16, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fcvt.s.bf16 fa5, fa0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v16, v16, fa5
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%vc = fadd <vscale x 16 x bfloat> %va, %splat
@@ -190,78 +435,216 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v0, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v0, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v0, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v0, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 32 x bfloat> %va, %vb
ret <vscale x 32 x bfloat> %vc
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a0, fa0
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v0, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: fmv.x.h a0, fa0
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv.v.x v8, a0
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v0, v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a0
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: sub sp, sp, a0
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v8, a0
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v0, v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%vc = fadd <vscale x 32 x bfloat> %va, %splat
@@ -285,6 +668,12 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 1 x half> %va, %vb
ret <vscale x 1 x half> %vc
}
@@ -306,6 +695,12 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%vc = fadd <vscale x 1 x half> %va, %splat
@@ -329,6 +724,12 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 2 x half> %va, %vb
ret <vscale x 2 x half> %vc
}
@@ -350,6 +751,12 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%vc = fadd <vscale x 2 x half> %va, %splat
@@ -373,6 +780,12 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 4 x half> %va, %vb
ret <vscale x 4 x half> %vc
}
@@ -394,6 +807,12 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%vc = fadd <vscale x 4 x half> %va, %splat
@@ -417,6 +836,12 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 8 x half> %va, %vb
ret <vscale x 8 x half> %vc
}
@@ -438,6 +863,12 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x half> %va, %splat
@@ -461,6 +892,12 @@ define <vscale x 8 x half> @vfadd_fv_nxv8f16(<vscale x 8 x half> %va, half %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_fv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%vc = fadd <vscale x 8 x half> %splat, %va
@@ -484,6 +921,12 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 16 x half> %va, %vb
ret <vscale x 16 x half> %vc
}
@@ -505,6 +948,12 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%vc = fadd <vscale x 16 x half> %va, %splat
@@ -549,6 +998,12 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
%vc = fadd <vscale x 32 x half> %va, %vb
ret <vscale x 32 x half> %vc
}
@@ -596,6 +1051,12 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vf v8, v8, fa0
+; ZVFBFA-NEXT: ret
%head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%vc = fadd <vscale x 32 x half> %va, %splat
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 32e3d6b..633a201 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -11,52 +11,125 @@
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
declare <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
@@ -64,18 +137,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloa
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -83,18 +182,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %v
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -102,18 +227,44 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
}
define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v8, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v8, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -123,48 +274,118 @@ define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x b
declare <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl)
ret <vscale x 2 x bfloat> %v
}
define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v9, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x bfloat> %v
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
@@ -172,18 +393,44 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloa
}
define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmv.v.x v9, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFH-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFH-NEXT: vfadd.vv v9, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v9, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -193,48 +440,118 @@ define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
declare <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x bfloat> %v
}
define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v12, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v12, v10
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v12, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x bfloat> %v
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v10, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmv.v.x v12, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v12, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
@@ -242,18 +559,44 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloa
}
define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfadd.vv v10, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmv.v.x v12, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v12
+; ZVFH-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFH-NEXT: vfadd.vv v10, v10, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v12, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v10, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v10, v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -263,48 +606,118 @@ define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
declare <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl)
ret <vscale x 8 x bfloat> %v
}
define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v16, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v16, v12
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x bfloat> %v
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v12, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
@@ -312,18 +725,44 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloa
}
define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfadd.vv v12, v12, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFH-NEXT: vfadd.vv v12, v12, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v12, v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -333,48 +772,118 @@ define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
declare <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl)
ret <vscale x 16 x bfloat> %v
}
define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
@@ -382,18 +891,44 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
}
define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv16bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v8
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -403,173 +938,493 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat
declare <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vmv1r.v v7, v0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vslidedown.vx v0, v0, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB22_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB22_2:
+; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v7, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB22_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB22_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB22_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x bfloat> %v
}
define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vv_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFH-NEXT: vmset.m v24
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v24, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB23_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB23_2:
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v24
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB23_2:
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB23_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB23_2:
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x bfloat> %v
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB24_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB24_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 4
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv1r.v v7, v0
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vmv.v.x v24, a1
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v0, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: csrr a3, vlenb
+; ZVFH-NEXT: slli a3, a3, 3
+; ZVFH-NEXT: add a3, sp, a3
+; ZVFH-NEXT: addi a3, a3, 16
+; ZVFH-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB24_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB24_2:
+; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v7, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB24_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 4
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: csrr a3, vlenb
+; ZVFBFA-NEXT: slli a3, a3, 3
+; ZVFBFA-NEXT: add a3, sp, a3
+; ZVFBFA-NEXT: addi a3, a3, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB24_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB24_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add a0, sp, a0
+; ZVFBFA-NEXT: addi a0, a0, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 4
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -577,56 +1432,158 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfadd.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: fmv.x.h a1, fa0
+; ZVFH-NEXT: csrr a2, vlenb
+; ZVFH-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmset.m v24
+; ZVFH-NEXT: vmv.v.x v16, a1
+; ZVFH-NEXT: slli a1, a2, 1
+; ZVFH-NEXT: srli a2, a2, 2
+; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: vslidedown.vx v0, v24, a2
+; ZVFH-NEXT: sltu a2, a0, a3
+; ZVFH-NEXT: addi a2, a2, -1
+; ZVFH-NEXT: and a2, a2, a3
+; ZVFH-NEXT: addi a3, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFH-NEXT: bltu a0, a1, .LBB25_2
+; ZVFH-NEXT: # %bb.1:
+; ZVFH-NEXT: mv a0, a1
+; ZVFH-NEXT: .LBB25_2:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfadd.vv v16, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: .cfi_def_cfa sp, 16
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: .cfi_def_cfa_offset 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmset.m v24
+; ZVFHMIN-NEXT: vmv.v.x v16, a1
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a2, a2, -1
+; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a3, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB25_2:
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.h a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB25_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB25_2:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -651,6 +1608,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x half> %v
}
@@ -672,6 +1640,17 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x half> %v
}
@@ -695,6 +1674,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
@@ -720,6 +1712,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -745,6 +1750,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -770,6 +1788,19 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv1f16_unmasked_commute:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v8, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
%v = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> %vb, <vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -795,6 +1826,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> %m, i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -816,6 +1858,17 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v9, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -839,6 +1892,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
@@ -864,6 +1930,19 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v9, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v9
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
%v = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -889,6 +1968,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x half> %v
}
@@ -910,6 +2000,17 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v9
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v12, v10
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x half> %v
}
@@ -933,6 +2034,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
@@ -958,6 +2072,19 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v12, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v10, v10, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v10
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
%v = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -983,6 +2110,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> %m, i32 %evl)
ret <vscale x 8 x half> %v
}
@@ -1004,6 +2142,17 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v10
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v16, v12
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x half> %v
}
@@ -1027,6 +2176,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
@@ -1052,6 +2214,19 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v12, v12, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v12
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
%v = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -1077,6 +2252,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
ret <vscale x 16 x half> %v
}
@@ -1098,6 +2284,17 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
}
@@ -1121,6 +2318,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
@@ -1146,6 +2356,19 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
%v = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -1209,6 +2432,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB48_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB48_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
@@ -1268,6 +2540,55 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB49_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB49_2:
+; ZVFBFA-NEXT: addi a1, sp, 16
+; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
@@ -1340,6 +2661,68 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 4
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmv1r.v v7, v0
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vmv.v.x v24, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: csrr a3, vlenb
+; ZVFBFA-NEXT: slli a3, a3, 3
+; ZVFBFA-NEXT: add a3, sp, a3
+; ZVFBFA-NEXT: addi a3, a3, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB50_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB50_2:
+; ZVFBFA-NEXT: vmv1r.v v0, v7
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add a0, sp, a0
+; ZVFBFA-NEXT: addi a0, a0, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16, v0.t
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 4
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1403,6 +2786,57 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: addi sp, sp, -16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 16
+; ZVFBFA-NEXT: csrr a1, vlenb
+; ZVFBFA-NEXT: slli a1, a1, 3
+; ZVFBFA-NEXT: sub sp, sp, a1
+; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFBFA-NEXT: fmv.x.w a1, fa0
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: vsetvli a3, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vmset.m v24
+; ZVFBFA-NEXT: vmv.v.x v16, a1
+; ZVFBFA-NEXT: slli a1, a2, 1
+; ZVFBFA-NEXT: srli a2, a2, 2
+; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
+; ZVFBFA-NEXT: sltu a2, a0, a3
+; ZVFBFA-NEXT: addi a2, a2, -1
+; ZVFBFA-NEXT: and a2, a2, a3
+; ZVFBFA-NEXT: addi a3, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFBFA-NEXT: bltu a0, a1, .LBB51_2
+; ZVFBFA-NEXT: # %bb.1:
+; ZVFBFA-NEXT: mv a0, a1
+; ZVFBFA-NEXT: .LBB51_2:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFBFA-NEXT: addi a0, sp, 16
+; ZVFBFA-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFBFA-NEXT: vfadd.vv v16, v16, v24
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16
+; ZVFBFA-NEXT: csrr a0, vlenb
+; ZVFBFA-NEXT: slli a0, a0, 3
+; ZVFBFA-NEXT: add sp, sp, a0
+; ZVFBFA-NEXT: .cfi_def_cfa sp, 16
+; ZVFBFA-NEXT: addi sp, sp, 16
+; ZVFBFA-NEXT: .cfi_def_cfa_offset 0
+; ZVFBFA-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index bd07ba1..eb4cf76 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -20,6 +20,9 @@
; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
entry:
@@ -39,6 +42,9 @@ entry:
%call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
%arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+ %call10 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 1)
+ %call11 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 2)
+ %call12 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 3)
ret void
}
@@ -59,3 +65,6 @@ declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_add
; Function Attrs: convergent nounwind
declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
+
+; Function Attrs: nounwind
+declare spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
index 4d32e66..6d41875 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
@@ -1,5 +1,5 @@
; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
-; Test that uses of cbuffer members inside ConstantExprs are handled correctly.
+; Test that uses of cbuffer members, including inside ConstantExprs, are handled correctly.
; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0
; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0
@@ -37,10 +37,8 @@ entry:
; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]]
; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]]
; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]]
- %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1
-
; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]]
- %load_from_gep = load float, ptr addrspace(12) %gep, align 4
+ %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4
; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]]
%load_v = load <4 x float>, ptr addrspace(12) @v, align 16
diff --git a/llvm/test/CodeGen/Thumb/PR17309.ll b/llvm/test/CodeGen/Thumb/PR17309.ll
index b548499..4da25ca 100644
--- a/llvm/test/CodeGen/Thumb/PR17309.ll
+++ b/llvm/test/CodeGen/Thumb/PR17309.ll
@@ -48,7 +48,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
-attributes #2 = { optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { optsize "less-precise-fpmad"="false" "frame-pointer"="non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #3 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/Thumb/fastcc.ll b/llvm/test/CodeGen/Thumb/fastcc.ll
index be356d8..000e20a 100644
--- a/llvm/test/CodeGen/Thumb/fastcc.ll
+++ b/llvm/test/CodeGen/Thumb/fastcc.ll
@@ -29,7 +29,7 @@ for.body193: ; preds = %for.body193, %for.e
br label %for.body193
}
-attributes #0 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/Thumb/ldm-merge-call.ll b/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
index 700b207..33c4346 100644
--- a/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
+++ b/llvm/test/CodeGen/Thumb/ldm-merge-call.ll
@@ -19,6 +19,6 @@ entry:
; Function Attrs: optsize
declare i32 @bar(i32, i32, i32, i32) #1
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
index cc14239..82314be 100644
--- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll
@@ -50,4 +50,4 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Thumb/stm-merge.ll b/llvm/test/CodeGen/Thumb/stm-merge.ll
index 837c2f6..426210a 100644
--- a/llvm/test/CodeGen/Thumb/stm-merge.ll
+++ b/llvm/test/CodeGen/Thumb/stm-merge.ll
@@ -38,4 +38,4 @@ for.end8: ; preds = %for.body5
ret void
}
-attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
index 4e2a275..00f0a1c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-block-debug.mir
@@ -118,7 +118,7 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
- attributes #0 = { nofree nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+cdecp0,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+ attributes #0 = { nofree nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+cdecp0,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "tmp.c", directory: "")
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
index 4e817ba..e48a038 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-1-pred.mir
@@ -13,7 +13,7 @@
ret <4 x float> %inactive1
}
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
index 6b5cbce..b8657c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-2-preds.mir
@@ -16,7 +16,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
index 91ccf3b..68a38a4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-ctrl-flow.mir
@@ -19,7 +19,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
index 1f19ed9..caa7b17 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks-non-consecutive-ins.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
index 4f75b01..2f07485 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-2-blocks.mir
@@ -18,7 +18,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
index c268388..f6b64a0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-3-blocks-kill-vpr.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
index 1e9e0e3..d086566 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-1-ins.mir
@@ -14,7 +14,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
index cb73cdf..5436882 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-2-ins.mir
@@ -15,7 +15,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
index 62d7640d..435836d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-4-ins.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
index 130c7f4..dc195dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
@@ -17,7 +17,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index 0ffed2e..ee2e58f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -18,7 +18,7 @@
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #2
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #3
- attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { argmemonly nounwind readonly willreturn }
attributes #3 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
index 695a8d8..ba21068 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-optnone.mir
@@ -14,7 +14,7 @@
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
- attributes #0 = { noinline optnone nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
index 8777d51..db779de 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-4.ll
@@ -179,7 +179,7 @@ return: ; preds = %entry, %if.end
; CHECK-NOT: aut
; CHECK: b _Z1hii
-attributes #0 = { minsize noinline optsize "sign-return-address"="non-leaf" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+armv7-m,+hwdiv,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { minsize noinline optsize "sign-return-address"="non-leaf" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+armv7-m,+hwdiv,+thumb-mode" "use-soft-float"="false" }
attributes #1 = { nounwind "sign-return-address"="non-leaf" }
attributes #2 = { noreturn "sign-return-address"="non-leaf" }
diff --git a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
index 4a93c2c..0ee075c 100644
--- a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll
@@ -38,7 +38,7 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!0 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
index 48b75ed5..f5eb642 100644
--- a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
+++ b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir
@@ -29,7 +29,7 @@
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
- attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "use-soft-float"="false" }
...
---
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 104ec31..5eb49fd 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -2103,10 +2103,7 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: four_floats_same_op:
; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: f32x4.mul
-; CHECK: v128.store
+; CHECK-NOT: v128.load
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
index 45f4ddd..f224a0d 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
@@ -54,6 +54,252 @@ define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %result
}
+define <4 x float> @test_pmax_v4f32_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_olt:
+; CHECK: .functype test_pmax_v4f32_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_ole:
+; CHECK: .functype test_pmax_v4f32_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_ogt:
+; CHECK: .functype test_pmax_v4f32_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmax_v4f32_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_oge:
+; CHECK: .functype test_pmax_v4f32_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setlt
+define <4 x float> @test_pmax_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_olt:
+; CHECK: .functype test_pmax_v4f32_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setle
+define <4 x float> @test_pmax_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_ole:
+; CHECK: .functype test_pmax_v4f32_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setgt
+define <4 x float> @test_pmax_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_ogt:
+; CHECK: .functype test_pmax_v4f32_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+ ret <4 x float> %a
+}
+
+; For setge
+define <4 x float> @test_pmax_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmax_v4f32_fast_oge:
+; CHECK: .functype test_pmax_v4f32_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+ ret <4 x float> %a
+}
+
+define <4 x i32> @test_pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_pmax_int_v4f32:
+; CHECK: .functype test_pmax_int_v4f32 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <4 x i32> %x to <4 x float>
+ %fy = bitcast <4 x i32> %y to <4 x float>
+ %c = fcmp olt <4 x float> %fy, %fx
+ %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %a
+}
+
+define <2 x double> @test_pmax_v2f64_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_olt:
+; CHECK: .functype test_pmax_v2f64_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_ole:
+; CHECK: .functype test_pmax_v2f64_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_ogt:
+; CHECK: .functype test_pmax_v2f64_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmax_v2f64_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_oge:
+; CHECK: .functype test_pmax_v2f64_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+; For setlt
+define <2 x double> @test_pmax_v2f64_fast_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_olt:
+; CHECK: .functype test_pmax_v2f64_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setle
+define <2 x double> @test_pmax_v2f64_fast_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_ole:
+; CHECK: .functype test_pmax_v2f64_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setgt
+define <2 x double> @test_pmax_v2f64_fast_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_ogt:
+; CHECK: .functype test_pmax_v2f64_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+; For setge
+define <2 x double> @test_pmax_v2f64_fast_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmax_v2f64_fast_oge:
+; CHECK: .functype test_pmax_v2f64_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %a
+}
+
+define <2 x i64> @test_pmax_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_pmax_int_v2f64:
+; CHECK: .functype test_pmax_int_v2f64 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <2 x i64> %x to <2 x double>
+ %fy = bitcast <2 x i64> %y to <2 x double>
+ %c = fcmp olt <2 x double> %fy, %fx
+ %a = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %a
+}
+
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
index f3eec02..4604465 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
@@ -53,6 +53,252 @@ define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %result
}
+define <4 x float> @test_pmin_v4f32_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_olt:
+; CHECK: .functype test_pmin_v4f32_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_ole:
+; CHECK: .functype test_pmin_v4f32_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_ogt:
+; CHECK: .functype test_pmin_v4f32_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x float> @test_pmin_v4f32_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_oge:
+; CHECK: .functype test_pmin_v4f32_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setlt
+define <4 x float> @test_pmin_v4f32_fast_olt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_olt:
+; CHECK: .functype test_pmin_v4f32_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setle
+define <4 x float> @test_pmin_v4f32_fast_ole(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_ole:
+; CHECK: .functype test_pmin_v4f32_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <4 x float> %y, %x
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setgt
+define <4 x float> @test_pmin_v4f32_fast_ogt(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_ogt:
+; CHECK: .functype test_pmin_v4f32_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+; For setge
+define <4 x float> @test_pmin_v4f32_fast_oge(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_pmin_v4f32_fast_oge:
+; CHECK: .functype test_pmin_v4f32_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <4 x float> %x, %y
+ %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
+ ret <4 x float> %a
+}
+
+define <4 x i32> @test_pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_pmin_int_v4f32:
+; CHECK: .functype test_pmin_int_v4f32 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <4 x i32> %x to <4 x float>
+ %fy = bitcast <4 x i32> %y to <4 x float>
+ %c = fcmp olt <4 x float> %fy, %fx
+ %a = select <4 x i1> %c, <4 x i32> %y, <4 x i32> %x
+ ret <4 x i32> %a
+}
+
+define <2 x double> @test_pmin_v2f64_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_olt:
+; CHECK: .functype test_pmin_v2f64_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp olt <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_ole:
+; CHECK: .functype test_pmin_v2f64_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ole <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_ogt:
+; CHECK: .functype test_pmin_v2f64_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x double> @test_pmin_v2f64_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_oge:
+; CHECK: .functype test_pmin_v2f64_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setlt
+define <2 x double> @test_pmin_v2f64_fast_olt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_olt:
+; CHECK: .functype test_pmin_v2f64_fast_olt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast olt <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setle
+define <2 x double> @test_pmin_v2f64_fast_ole(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_ole:
+; CHECK: .functype test_pmin_v2f64_fast_ole (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ole <2 x double> %y, %x
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setgt
+define <2 x double> @test_pmin_v2f64_fast_ogt(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_ogt:
+; CHECK: .functype test_pmin_v2f64_fast_ogt (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast ogt <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+; For setge
+define <2 x double> @test_pmin_v2f64_fast_oge(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: test_pmin_v2f64_fast_oge:
+; CHECK: .functype test_pmin_v2f64_fast_oge (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %c = fcmp fast oge <2 x double> %x, %y
+ %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
+ ret <2 x double> %a
+}
+
+define <2 x i64> @test_pmin_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_pmin_int_v2f64:
+; CHECK: .functype test_pmin_int_v2f64 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %fx = bitcast <2 x i64> %x to <2 x double>
+ %fy = bitcast <2 x i64> %y to <2 x double>
+ %c = fcmp olt <2 x double> %fy, %fx
+ %a = select <2 x i1> %c, <2 x i64> %y, <2 x i64> %x
+ ret <2 x i64> %a
+}
+
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.fminimumnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
index f82cd11..f12fc4a8 100644
--- a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
+++ b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
@@ -23,6 +23,6 @@ entry:
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
index 64e569c..3d40240 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
@@ -63,7 +63,7 @@
ret double %conv
}
- attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
index 0bb6061..8f76ad5 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
@@ -35,7 +35,7 @@
ret double %conv
}
- attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
index a992222..1612485 100644
--- a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
@@ -64,9 +64,9 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
- attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
index 8e7a463..69689f0 100644
--- a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
+++ b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
@@ -18,5 +18,5 @@ entry:
; Function Attrs: uwtable
declare void @_ZN12ValueWrapperIS_IS_IdEEEC2Ev(ptr) unnamed_addr #0 align 2
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 45277ce..3e7b73a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
define void @test1(ptr %ptr, i32 %val1) {
; CHECK-LABEL: test1:
@@ -34,6 +34,943 @@ define i32 @test3(ptr %ptr) {
%val = load atomic i32, ptr %ptr seq_cst, align 4
ret i32 %val
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-O0: {{.*}}
-; CHECK-O3: {{.*}}
+
+define <1 x i32> @atomic_vec1_i32(ptr %x) {
+; CHECK-LABEL: atomic_vec1_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+ ret <1 x i32> %ret
+}
+
+define <1 x i8> @atomic_vec1_i8(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i8:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i8:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i8:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i8:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movb (%rdi), %al
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i8:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movb (%rdi), %al
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i8:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movb (%rdi), %al
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+ ret <1 x i8> %ret
+}
+
+define <1 x i16> @atomic_vec1_i16(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i16:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i16:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i16:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i16:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %ax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i16:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i16:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+ ret <1 x i16> %ret
+}
+
+define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-O3-NEXT: movzbl %al, %eax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: movzbl %al, %eax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movzbl %al, %eax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movb (%rdi), %al
+; CHECK-O0-NEXT: movzbl %al, %eax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movb (%rdi), %al
+; CHECK-SSE-O0-NEXT: movzbl %al, %eax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movb (%rdi), %al
+; CHECK-AVX-O0-NEXT: movzbl %al, %eax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+ %zret = zext <1 x i8> %ret to <1 x i32>
+ ret <1 x i32> %zret
+}
+
+define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: movswq %ax, %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: movswq %ax, %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movswq %ax, %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %ax
+; CHECK-O0-NEXT: movswq %ax, %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
+; CHECK-SSE-O0-NEXT: movswq %ax, %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
+; CHECK-AVX-O0-NEXT: movswq %ax, %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+ %sret = sext <1 x i16> %ret to <1 x i64>
+ ret <1 x i64> %sret
+}
+
+define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
+; CHECK-LABEL: atomic_vec1_ptr270:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x ptr addrspace(270)>, ptr %x acquire, align 4
+ ret <1 x ptr addrspace(270)> %ret
+}
+
+define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2
+ ret <1 x bfloat> %ret
+}
+
+define <1 x ptr> @atomic_vec1_ptr_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_ptr_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 8
+ ret <1 x ptr> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_i64_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: retq
+ %ret = load atomic <1 x i64>, ptr %x acquire, align 8
+ ret <1 x i64> %ret
+}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_ptr:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_ptr:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
+
+define <1 x half> @atomic_vec1_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x half>, ptr %x acquire, align 2
+ ret <1 x half> %ret
+}
+
+define <1 x float> @atomic_vec1_float(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x float>, ptr %x acquire, align 4
+ ret <1 x float> %ret
+}
+
+define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double_align:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double_align:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 8
+ ret <1 x double> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_i64:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i64:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i64>, ptr %x acquire, align 4
+ ret <1 x i64> %ret
+}
+
+define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 4
+ ret <1 x double> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec2_i32:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_i32:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x i32>, ptr %x acquire, align 4
+ ret <2 x i32> %ret
+}
+
+define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $24, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $16, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: addq $24, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $24, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $16, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: addq $24, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $24, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $16, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: addq $24, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $24, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $16, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: addq $24, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $24, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $16, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: addq $24, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $24, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $16, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: addq $24, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x float>, ptr %x acquire, align 4
+ ret <4 x float> %ret
+}
+
+define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec8_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec8_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec8_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec8_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <8 x double>, ptr %x acquire, align 4
+ ret <8 x double> %ret
+}
+
+define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $40, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $32, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: addq $40, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $40, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $32, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: addq $40, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $40, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $32, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O3-NEXT: addq $40, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $40, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $32, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: addq $40, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $40, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $32, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: addq $40, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $40, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $32, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O0-NEXT: addq $40, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <16 x bfloat>, ptr %x acquire, align 4
+ ret <16 x bfloat> %ret
+}
+
+define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec32_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec32_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec32_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec32_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <32 x half>, ptr %x acquire, align 4
+ ret <32 x half> %ret
+}
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
index 99fee27..88d7682 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
@@ -52,7 +52,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
index 50b2433..8dbd4e2 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
@@ -63,7 +63,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
index 7a4b993..c0924ea 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
@@ -73,7 +73,7 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
index da9d16c..f074390 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
+++ b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
@@ -502,6 +502,6 @@ entry:
ret void
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index b4ba239..7fd4f59 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -48,5 +48,5 @@ entry:
; Function Attrs: nounwind readnone
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1
-attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
index 4e76262..423e318 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
@@ -19,7 +19,7 @@ entry:
; CHECK: func:
; CHECK: .Lfunc_begin1:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 0 # feature
; PGO-NEXT: .byte 3 # feature
; CHECK-NEXT: .quad .Lfunc_begin1 # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
index f610b04..e32e522 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
@@ -10,7 +10,7 @@ define dso_local i32 @_Z3barv() {
; CHECK-LABEL: _Z3barv:
; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3barv{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[BAR_BEGIN]] # function address
@@ -23,7 +23,7 @@ define dso_local i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3foov{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad [[FOO_BEGIN]] # function address
@@ -36,6 +36,6 @@ define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat {
; CHECK-LABEL: _Z4fooTIiET_v:
; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"oG",@llvm_bb_addr_map,.text._Z4fooTIiET_v,_Z4fooTIiET_v,comdat{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
index ba76f3e..12b1297 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
@@ -69,7 +69,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 32 # feature
; PGO-ALL-NEXT: .byte 39 # feature
; FEC-ONLY-NEXT:.byte 33 # feature
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
index 6157f1a..aeb6dc95 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
@@ -47,7 +47,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot._Z3bazb
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 40 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
; CHECK-NEXT: .quad .Lfunc_begin0 # base address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
new file mode 100644
index 0000000..a5678877
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
@@ -0,0 +1,94 @@
+; Check that the -basic-block-address-map option works when used along with -emit-bb-hash.
+; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-address-map -emit-bb-hash | FileCheck %s --check-prefixes=CHECK,UNIQ
+
+define void @_Z3bazb(i1 zeroext, i1 zeroext) personality ptr @__gxx_personality_v0 {
+ br i1 %0, label %3, label %8
+
+3:
+ %4 = invoke i32 @_Z3barv()
+ to label %8 unwind label %6
+ br label %10
+
+6:
+ landingpad { ptr, i32 }
+ catch ptr null
+ br label %12
+
+8:
+ %9 = call i32 @_Z3foov()
+ br i1 %1, label %12, label %10
+
+10:
+ %11 = select i1 %1, ptr blockaddress(@_Z3bazb, %3), ptr blockaddress(@_Z3bazb, %12) ; <ptr> [#uses=1]
+ indirectbr ptr %11, [label %3, label %12]
+
+12:
+ ret void
+}
+
+declare i32 @_Z3barv() #1
+
+declare i32 @_Z3foov() #1
+
+declare i32 @__gxx_personality_v0(...)
+
+; UNIQ: .section .text._Z3bazb,"ax",@progbits{{$}}
+; NOUNIQ: .section .text,"ax",@progbits,unique,1
+; CHECK-LABEL: _Z3bazb:
+; CHECK-LABEL: .Lfunc_begin0:
+; CHECK-LABEL: .LBB_END0_0:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-LABEL: .LBB0_1_CS0:
+; CHECK-LABEL: .LBB_END0_1:
+; CHECK-LABEL: .LBB0_2:
+; CHECK-LABEL: .LBB0_2_CS0:
+; CHECK-LABEL: .LBB_END0_2:
+; CHECK-LABEL: .LBB0_3:
+; CHECK-LABEL: .LBB_END0_3:
+; CHECK-LABEL: .Lfunc_end0:
+
+; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
+;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
+; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
+; CHECK-NEXT: .byte 4 # version
+; CHECK-NEXT: .byte 96 # feature
+; CHECK-NEXT: .quad .Lfunc_begin0 # function address
+; CHECK-NEXT: .byte 6 # number of basic blocks
+; CHECK-NEXT: .byte 0 # BB id
+; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 1 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_1_CS0-.LBB0_1
+; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 3 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_2_CS0-.LBB0_2
+; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 4 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 5 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_4-.LBB_END0_3
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_4-.LBB0_4
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 2 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_5-.LBB_END0_4
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_5-.LBB0_5
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .quad {{-?[0-9]+}}
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
index 1e8cee4..d49b313 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
@@ -58,7 +58,7 @@ declare i32 @qux()
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot.foo
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 40 # feature
; PGO-NEXT: .byte 47 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map.ll b/llvm/test/CodeGen/X86/basic-block-address-map.ll
index 5c8f3a6..64cf2c7 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map.ll
@@ -52,7 +52,7 @@ declare i32 @__gxx_personality_v0(...)
; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad .Lfunc_begin0 # function address
; CHECK-NEXT: .byte 6 # number of basic blocks
diff --git a/llvm/test/CodeGen/X86/bit-piece-comment.ll b/llvm/test/CodeGen/X86/bit-piece-comment.ll
index d74863f..85c64a7 100644
--- a/llvm/test/CodeGen/X86/bit-piece-comment.ll
+++ b/llvm/test/CodeGen/X86/bit-piece-comment.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index 632d90d..f36baba 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .llvm.callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index ed6849a..cdbad66 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .llvm.callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/X86/catchpad-regmask.ll b/llvm/test/CodeGen/X86/catchpad-regmask.ll
index 9dba897..713d015 100644
--- a/llvm/test/CodeGen/X86/catchpad-regmask.ll
+++ b/llvm/test/CodeGen/X86/catchpad-regmask.ll
@@ -130,7 +130,7 @@ unreachable: ; preds = %entry
; CHECK: retq # CATCHRET
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { noreturn }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/catchpad-weight.ll b/llvm/test/CodeGen/X86/catchpad-weight.ll
index e97f358..699243d 100644
--- a/llvm/test/CodeGen/X86/catchpad-weight.ll
+++ b/llvm/test/CodeGen/X86/catchpad-weight.ll
@@ -74,8 +74,8 @@ declare void @"\01??1HasDtor@@QEAA@XZ"(ptr) #3
; Function Attrs: nounwind argmemonly
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind argmemonly }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/clang-section-coff.ll b/llvm/test/CodeGen/X86/clang-section-coff.ll
index 02381fd..6b76bb6 100644
--- a/llvm/test/CodeGen/X86/clang-section-coff.ll
+++ b/llvm/test/CodeGen/X86/clang-section-coff.ll
@@ -37,8 +37,8 @@ attributes #0 = { "bss-section"="my_bss.1" "data-section"="my_data.1" "rodata-se
attributes #1 = { "data-section"="my_data.1" "rodata-section"="my_rodata.1" }
attributes #2 = { "bss-section"="my_bss.2" "rodata-section"="my_rodata.1" }
attributes #3 = { "bss-section"="my_bss.2" "data-section"="my_data.2" "rodata-section"="my_rodata.2" }
-attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2, !3}
diff --git a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
index 01e7019..863f580 100644
--- a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
+++ b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
@@ -65,4 +65,4 @@ declare i32 @__CxxFrameHandler3(...)
declare x86_thiscallcc void @"\01??1A@@QAE@XZ"(ptr) #0
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index 2241736..a2aaea3 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -89,4 +89,52 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
ret i32 %sum
}
+define i32 @adc_merge_sub(i32 %a0) nounwind {
+; X86-LABEL: adc_merge_sub:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl $42, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_merge_sub:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %ebx
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: addl $42, %ebx
+; X64-NEXT: setb %dil
+; X64-NEXT: movl %ebx, %ebp
+; X64-NEXT: negl %ebp
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebx, %ebp
+; X64-NEXT: movl %ebp, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %adc = tail call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a0, i32 42)
+ %carry = extractvalue { i8, i32 } %adc, 0
+ call void @use(i8 %carry)
+ %sum = extractvalue { i8, i32 } %adc, 1
+ %sub = sub i32 -42, %a0
+ %result = xor i32 %sum, %sub
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 89aee96..62744d4 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -333,4 +333,85 @@ define i32 @PR40483_sub6(ptr, i32) nounwind {
ret i32 %10
}
+define i32 @sbb_merge_add1(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add1:
+; X86: # %bb.0:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT: setb %al
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add1:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl $42, %edi
+; X64-NEXT: setb %al
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
+define i32 @sbb_merge_add2(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl $42, %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add2:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl $42, %ebp
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %ebp
+; X64-NEXT: setb %al
+; X64-NEXT: movl %ebp, %ebx
+; X64-NEXT: negl %ebx
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebp, %ebx
+; X64-NEXT: movl %ebx, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 42, i32 %a0)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/complex-fastmath.ll b/llvm/test/CodeGen/X86/complex-fastmath.ll
index 29a37a1..21bb64a 100644
--- a/llvm/test/CodeGen/X86/complex-fastmath.ll
+++ b/llvm/test/CodeGen/X86/complex-fastmath.ll
@@ -212,4 +212,4 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
ret <2 x double> %14
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" }
diff --git a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
index ddddcfa..9f51fa4 100644
--- a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
+++ b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
@@ -264,5 +264,5 @@ unreachable: ; preds = %cleanup100
; Function Attrs: nounwind
declare void @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/dag-optnone.ll b/llvm/test/CodeGen/X86/dag-optnone.ll
index 66e4c1d..022694e 100644
--- a/llvm/test/CodeGen/X86/dag-optnone.ll
+++ b/llvm/test/CodeGen/X86/dag-optnone.ll
@@ -28,7 +28,7 @@
; a repeated fadd that can be combined into an fmul. We show that this
; happens in both the non-optnone function and the optnone function.
-define float @foo(float %x, ...) #0 {
+define float @foo(float %x, ...) {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -68,5 +68,4 @@ entry:
ret void
}
-attributes #0 = { "unsafe-fp-math"="true" }
-attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone }
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index b428ce4..71ad598 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -96,6 +96,17 @@ entry:
define void @_Z2x6v() local_unnamed_addr {
; CHECK-LABEL: _Z2x6v:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movl (%rax), %edx
+; CHECK-NEXT: andl $511, %edx # imm = 0x1FF
+; CHECK-NEXT: leaq 1(%rdx), %rax
+; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl %eax, (%rcx)
+; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl (%rcx), %ecx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: je .LBB1_18
+; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
@@ -114,58 +125,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl (%rax), %ebx
-; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF
-; CHECK-NEXT: leaq 1(%rbx), %rax
-; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl %eax, (%rcx)
-; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl (%rcx), %ecx
-; CHECK-NEXT: testl %ecx, %ecx
-; CHECK-NEXT: je .LBB1_18
-; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx
-; CHECK-NEXT: movq (%rdx), %rsi
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: notl %edx
-; CHECK-NEXT: leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rsi
+; CHECK-NEXT: movq (%rsi), %rsi
+; CHECK-NEXT: movl %ecx, %edi
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: leaq 8(,%rdi,8), %rdi
; CHECK-NEXT: imulq %rax, %rdi
; CHECK-NEXT: addq %rsi, %rdi
; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8
-; CHECK-NEXT: movl (%r8), %edx
-; CHECK-NEXT: leal 8(,%rbx,8), %eax
+; CHECK-NEXT: movl (%r8), %r9d
+; CHECK-NEXT: leal 8(,%rdx,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 32(%rsi), %r11
-; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15
-; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: leaq 32(%rsi), %rbx
+; CHECK-NEXT: leaq 8(,%rdx,8), %r14
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r12
+; CHECK-NEXT: movq %rsi, %r13
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movl %edx, (%r8)
+; CHECK-NEXT: movl %r9d, (%r8)
; CHECK-NEXT: .LBB1_16: # %for.inc3
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rbx, %r12
-; CHECK-NEXT: incq %r14
-; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: incq %r15
+; CHECK-NEXT: addq %r14, %rbx
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: je .LBB1_17
; CHECK-NEXT: .LBB1_2: # %for.cond1thread-pre-split
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_12 Depth 2
; CHECK-NEXT: # Child Loop BB1_14 Depth 2
-; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: testl %r9d, %r9d
; CHECK-NEXT: jns .LBB1_16
; CHECK-NEXT: # %bb.3: # %for.body2.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movslq %edx, %r13
-; CHECK-NEXT: testq %r13, %r13
+; CHECK-NEXT: movslq %r9d, %r9
+; CHECK-NEXT: testq %r9, %r9
; CHECK-NEXT: movq $-1, %rbp
-; CHECK-NEXT: cmovnsq %r13, %rbp
-; CHECK-NEXT: subq %r13, %rbp
+; CHECK-NEXT: cmovnsq %r9, %rbp
+; CHECK-NEXT: subq %r9, %rbp
; CHECK-NEXT: incq %rbp
; CHECK-NEXT: cmpq $4, %rbp
; CHECK-NEXT: jb .LBB1_14
@@ -177,20 +177,20 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # %bb.5: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: imulq %r14, %rax
-; CHECK-NEXT: leaq (%rsi,%rax), %r10
-; CHECK-NEXT: leaq (%r10,%r13,8), %r9
-; CHECK-NEXT: testq %r13, %r13
-; CHECK-NEXT: movq $-1, %r10
-; CHECK-NEXT: cmovnsq %r13, %r10
-; CHECK-NEXT: cmpq %r15, %r9
+; CHECK-NEXT: imulq %r15, %rax
+; CHECK-NEXT: leaq (%rsi,%rax), %r11
+; CHECK-NEXT: leaq (%r11,%r9,8), %r10
+; CHECK-NEXT: testq %r9, %r9
+; CHECK-NEXT: movq $-1, %r11
+; CHECK-NEXT: cmovnsq %r9, %r11
+; CHECK-NEXT: cmpq %r12, %r10
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq 8(%rsi), %r9
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: leaq (%rax,%r10,8), %rax
-; CHECK-NEXT: cmpq %r15, %rax
+; CHECK-NEXT: leaq 8(%rsi), %r10
+; CHECK-NEXT: addq %r10, %rax
+; CHECK-NEXT: leaq (%rax,%r11,8), %rax
+; CHECK-NEXT: cmpq %r12, %rax
; CHECK-NEXT: ja .LBB1_14
; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
@@ -201,50 +201,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT: movl $4, %r10d
+; CHECK-NEXT: movdqu %xmm0, (%r13,%r9,8)
+; CHECK-NEXT: movdqu %xmm0, 16(%r13,%r9,8)
+; CHECK-NEXT: movl $4, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: jne .LBB1_11
; CHECK-NEXT: jmp .LBB1_13
; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: xorl %r11d, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: je .LBB1_13
; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: movq %r11, %rax
; CHECK-NEXT: subq %rdx, %rax
-; CHECK-NEXT: addq %r13, %r10
-; CHECK-NEXT: leaq (%r11,%r10,8), %r10
+; CHECK-NEXT: addq %r9, %r11
+; CHECK-NEXT: leaq (%rbx,%r11,8), %r11
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_12: # %vector.body
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movdqu %xmm0, -32(%r10)
-; CHECK-NEXT: movdqu %xmm0, -16(%r10)
-; CHECK-NEXT: movdqu %xmm0, (%r10)
-; CHECK-NEXT: movdqu %xmm0, 16(%r10)
-; CHECK-NEXT: addq $64, %r10
+; CHECK-NEXT: movdqu %xmm0, -32(%r11)
+; CHECK-NEXT: movdqu %xmm0, -16(%r11)
+; CHECK-NEXT: movdqu %xmm0, (%r11)
+; CHECK-NEXT: movdqu %xmm0, 16(%r11)
+; CHECK-NEXT: addq $64, %r11
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: jne .LBB1_12
; CHECK-NEXT: .LBB1_13: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rdx, %r13
+; CHECK-NEXT: addq %rdx, %r9
; CHECK-NEXT: cmpq %rdx, %rbp
-; CHECK-NEXT: movq %r13, %rdx
; CHECK-NEXT: je .LBB1_15
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_14: # %for.body2
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movq (%r15), %rax
-; CHECK-NEXT: movq %rax, (%r12,%r13,8)
-; CHECK-NEXT: leaq 1(%r13), %rdx
-; CHECK-NEXT: cmpq $-1, %r13
-; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: movq (%r12), %rax
+; CHECK-NEXT: movq %rax, (%r13,%r9,8)
+; CHECK-NEXT: incq %r9
; CHECK-NEXT: jl .LBB1_14
; CHECK-NEXT: jmp .LBB1_15
; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge
@@ -252,7 +249,6 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax
; CHECK-NEXT: movl $0, (%rax)
-; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
@@ -265,6 +261,13 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_restore %rbx
+; CHECK-NEXT: .cfi_restore %r12
+; CHECK-NEXT: .cfi_restore %r13
+; CHECK-NEXT: .cfi_restore %r14
+; CHECK-NEXT: .cfi_restore %r15
+; CHECK-NEXT: .cfi_restore %rbp
+; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: retq
entry:
%0 = load i32, ptr @x1, align 4
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index deba5a8..18e5490 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -112,9 +112,9 @@ declare void @_Z3fooPcjPKc(ptr, i32, ptr) #2
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
index fabdbbb..c688895 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -68,8 +68,8 @@ _ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!1 = distinct !DISubprogram()
diff --git a/llvm/test/CodeGen/X86/dbg-combine.ll b/llvm/test/CodeGen/X86/dbg-combine.ll
index b3d2213..3ff5a26 100644
--- a/llvm/test/CodeGen/X86/dbg-combine.ll
+++ b/llvm/test/CodeGen/X86/dbg-combine.ll
@@ -63,7 +63,7 @@ declare ptr @llvm.stacksave() #2
; Function Attrs: nounwind
declare void @llvm.stackrestore(ptr) #2
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/debug-loclists-lto.ll b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
index fde8e00..2bd927f 100644
--- a/llvm/test/CodeGen/X86/debug-loclists-lto.ll
+++ b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
@@ -34,8 +34,8 @@ entry:
ret void, !dbg !29
}
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone speculatable willreturn }
!llvm.dbg.cu = !{!0, !7}
diff --git a/llvm/test/CodeGen/X86/debugloc-argsize.ll b/llvm/test/CodeGen/X86/debugloc-argsize.ll
index 3cfeb6e..f4527c5 100644
--- a/llvm/test/CodeGen/X86/debugloc-argsize.ll
+++ b/llvm/test/CodeGen/X86/debugloc-argsize.ll
@@ -30,7 +30,7 @@ declare ptr @__cxa_begin_catch(ptr)
declare void @__cxa_end_catch()
-attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { optsize }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/early-cfi-sections.ll b/llvm/test/CodeGen/X86/early-cfi-sections.ll
index 3a9e62a..8ab0340 100644
--- a/llvm/test/CodeGen/X86/early-cfi-sections.ll
+++ b/llvm/test/CodeGen/X86/early-cfi-sections.ll
@@ -12,7 +12,7 @@ entry:
ret void, !dbg !8
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll
index 2c06c53..a44671c 100644
--- a/llvm/test/CodeGen/X86/fadd-combines.ll
+++ b/llvm/test/CodeGen/X86/fadd-combines.ll
@@ -275,4 +275,4 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do
ret <2 x double> %sub
}
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 5afa12c..1bc94b1 100644
--- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -65,5 +65,5 @@ entry:
declare i16 @llvm.convert.to.fp16.f64(double)
declare i16 @llvm.convert.to.fp16.f80(x86_fp80)
-attributes #0 = { nounwind readnone "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll
index 67bad09..859f54e 100644
--- a/llvm/test/CodeGen/X86/fdiv.ll
+++ b/llvm/test/CodeGen/X86/fdiv.ll
@@ -54,7 +54,7 @@ define double @denormal2(double %x) {
; Deleting the negates does not require unsafe-fp-math.
-define float @double_negative(float %x, float %y) #0 {
+define float @double_negative(float %x, float %y) {
; CHECK-LABEL: double_negative:
; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
@@ -65,7 +65,7 @@ define float @double_negative(float %x, float %y) #0 {
ret float %div
}
-define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
+define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: double_negative_vector:
; CHECK: # %bb.0:
; CHECK-NEXT: divps %xmm1, %xmm0
@@ -80,7 +80,7 @@ define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
; clang/gcc), due to order of argument evaluation not being well defined. We
; ended up hitting llvm_unreachable in getNegatedExpression when building with
; gcc. Just make sure that we get a deterministic result.
-define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
+define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) {
; CHECK-LABEL: fdiv_fneg_combine:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, %xmm3
@@ -99,6 +99,3 @@ define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
%div5 = fdiv fast float %mul2, %sub4
ret float %div5
}
-
-attributes #0 = { "unsafe-fp-math"="false" }
-
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index 4c16cf9..0c3ec8d 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1021,7 +1021,7 @@ define <8 x double> @test_v8f64_interp_ninf(<8 x double> %x, <8 x double> %y, <8
; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
;
-define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_v16f32_fneg_fmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
@@ -1044,7 +1044,7 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1,
ret <16 x float> %neg
}
-define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_v8f64_fneg_fmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
@@ -1067,7 +1067,7 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <
ret <8 x double> %neg
}
-define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_v16f32_fneg_fnmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
@@ -1091,7 +1091,7 @@ define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1,
ret <16 x float> %neg1
}
-define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_v8f64_fneg_fnmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
@@ -1119,7 +1119,7 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1,
; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
;
-define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
+define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) {
; FMA-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
; FMA: # %bb.0:
; FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1146,7 +1146,7 @@ define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
;
-define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) {
; FMA-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
@@ -1171,7 +1171,7 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
-define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) {
; FMA-LABEL: test_v16f32_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorps %xmm4, %xmm4, %xmm4
@@ -1196,7 +1196,7 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0
ret <16 x float> %n
}
-define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) {
; FMA-LABEL: test_v8f64_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorpd %xmm4, %xmm4, %xmm4
@@ -1221,7 +1221,7 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
ret <8 x double> %n
}
-define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) {
; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
@@ -1250,7 +1250,6 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
ret <8 x double> %n
}
-attributes #0 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512-INFS: {{.*}}
; FMA-INFS: {{.*}}
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964..d60d397 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -158,7 +158,7 @@ if.end: ; preds = %if.else, %if.then
ret i64 undef
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
index bd32430..fc5279d0 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
@@ -100,40 +100,52 @@ entry:
ret i32 %result
}
-; These 2 divs only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 divs only differ in their exception behavior. They form two groups;
+; within each group the constrained operations have the same exception behavior
+; and may be CSE'd. Instructions with different exception behavior belong to
+; different groups; they have different chain arguments and cannot be CSE'd.
define void @binop_cse(double %a, double %b, ptr %x, ptr %y) #0 {
entry:
; CHECK-LABEL: name: binop_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
-; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
-; CHECK: %3:fr64 = DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load (s64) from %fixed-stack.2)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
+; CHECK: [[B:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
+; CHECK: [[A:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
+; CHECK: [[DIV0:%[0-9]+]]:fr64 = DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: [[DIV1:%[0-9]+]]:fr64 = nofpexcept DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.y, align 4)
; CHECK: RET 0
%div = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ %div1 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
%div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
- store double %div, ptr %x
- store double %div2, ptr %y
+ %div3 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ store double %div2, ptr %x
+ store double %div3, ptr %y
ret void
}
-; These 2 sitofps only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 sitofps only differ in their exception behavior. They form two groups;
+; within each group the constrained operations have the same exception behavior
+; and may be CSE'd. Instructions with different exception behavior belong to
+; different groups; they have different chain arguments and cannot be CSE'd.
define void @sitofp_cse(i32 %a, ptr %x, ptr %y) #0 {
entry:
; CHECK-LABEL: name: sitofp_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
-; CHECK: %2:fr64 = CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
+; CHECK: [[A:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
+; CHECK: [[CVT0:%[0-9]+]]:fr64 = CVTSI2SDrr [[A]]
+; CHECK: [[CVT1:%[0-9]+]]:fr64 = nofpexcept CVTSI2SDrr [[A]]
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.y, align 4)
; CHECK: RET 0
%result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ %result1 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
%result2 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
- store double %result, ptr %x
- store double %result2, ptr %y
+ %result3 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ store double %result2, ptr %x
+ store double %result3, ptr %y
ret void
}
diff --git a/llvm/test/CodeGen/X86/fp128-g.ll b/llvm/test/CodeGen/X86/fp128-g.ll
index 58a57d3..d2b956f 100644
--- a/llvm/test/CodeGen/X86/fp128-g.ll
+++ b/llvm/test/CodeGen/X86/fp128-g.ll
@@ -106,8 +106,8 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
+attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!llvm.dbg.cu = !{!2}
diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
index f176a29..ef616ca 100644
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -526,6 +526,6 @@ cleanup: ; preds = %entry, %if.then
}
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/frame-order.ll b/llvm/test/CodeGen/X86/frame-order.ll
index dcbcb48..f410acf 100644
--- a/llvm/test/CodeGen/X86/frame-order.ll
+++ b/llvm/test/CodeGen/X86/frame-order.ll
@@ -74,9 +74,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
declare void @capture(ptr) #2
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/fsafdo_test2.ll b/llvm/test/CodeGen/X86/fsafdo_test2.ll
index d83e241..fc4c1e8 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test2.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test2.ll
@@ -196,10 +196,10 @@ if.end9.3:
}
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
index 7bb3bf42..4347d62 100644
--- a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -130,5 +130,5 @@ for.end: ; preds = %for.cond.preheader
; Function Attrs: nounwind
declare i32 @varfunc(ptr nocapture readonly, ...) #0
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
index f982196..11a1f39 100644
--- a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -23,7 +23,7 @@ entry:
; CHECK: lock
; CHECK-NEXT: cmpxchg16b
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/label-annotation.ll b/llvm/test/CodeGen/X86/label-annotation.ll
index 626040c..05e4e87 100644
--- a/llvm/test/CodeGen/X86/label-annotation.ll
+++ b/llvm/test/CodeGen/X86/label-annotation.ll
@@ -77,8 +77,8 @@ entry:
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { inaccessiblememonly noduplicate nounwind }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/label-heapallocsite.ll b/llvm/test/CodeGen/X86/label-heapallocsite.ll
index 31bca25..72834be6c 100644
--- a/llvm/test/CodeGen/X86/label-heapallocsite.ll
+++ b/llvm/test/CodeGen/X86/label-heapallocsite.ll
@@ -98,8 +98,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #2
; CHECK-NEXT: .short [[LABEL5]]-[[LABEL4]]
; CHECK-NEXT: .long 4096
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone speculatable willreturn }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/late-remat-update.mir b/llvm/test/CodeGen/X86/late-remat-update.mir
index 3212312..9108002 100644
--- a/llvm/test/CodeGen/X86/late-remat-update.mir
+++ b/llvm/test/CodeGen/X86/late-remat-update.mir
@@ -39,8 +39,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index 5199b15..96780af0 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -92,4 +92,4 @@ if.end:
; CHECK: pushl ([[REG3]])
}
-attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll
index 3efaccb..22e350c 100644
--- a/llvm/test/CodeGen/X86/lifetime-alias.ll
+++ b/llvm/test/CodeGen/X86/lifetime-alias.ll
@@ -140,10 +140,10 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture reado
; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
-attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind }
attributes #5 = { noreturn nounwind }
attributes #6 = { builtin nounwind }
diff --git a/llvm/test/CodeGen/X86/limit-split-cost.mir b/llvm/test/CodeGen/X86/limit-split-cost.mir
index 5b8bb98..8e4e786 100644
--- a/llvm/test/CodeGen/X86/limit-split-cost.mir
+++ b/llvm/test/CodeGen/X86/limit-split-cost.mir
@@ -53,8 +53,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
index 3dba5eb..206d453 100644
--- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
+++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
@@ -41,5 +41,5 @@ if.end: ; preds = %entry
declare <4 x float> @_Z1bv() local_unnamed_addr
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/misched-copy.ll b/llvm/test/CodeGen/X86/misched-copy.ll
index fa6cd15..e3ceddf 100644
--- a/llvm/test/CodeGen/X86/misched-copy.ll
+++ b/llvm/test/CodeGen/X86/misched-copy.ll
@@ -42,7 +42,7 @@ end:
ret i64 %add
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!"float", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll
index a6c489d..9029167 100644
--- a/llvm/test/CodeGen/X86/misched-matmul.ll
+++ b/llvm/test/CodeGen/X86/misched-matmul.ll
@@ -222,4 +222,4 @@ entry:
ret void
}
-attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/movpc32-check.ll b/llvm/test/CodeGen/X86/movpc32-check.ll
index e3730d0..4585dcb 100644
--- a/llvm/test/CodeGen/X86/movpc32-check.ll
+++ b/llvm/test/CodeGen/X86/movpc32-check.ll
@@ -12,8 +12,8 @@ entry:
declare void @bar(...) #1
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
index ec17b1d..04db25b 100644
--- a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
+++ b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
@@ -20,5 +20,5 @@ entry:
; CHECK: movq %rax, 7(%rsp)
; CHECK: retq
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/nocf_check.ll b/llvm/test/CodeGen/X86/nocf_check.ll
index 7b184ed..742b07d 100644
--- a/llvm/test/CodeGen/X86/nocf_check.ll
+++ b/llvm/test/CodeGen/X86/nocf_check.ll
@@ -66,8 +66,8 @@ bb2:
ret void
}
-attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #2 = { nocf_check }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index 3dd4aab..2de9a34 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -45,4 +45,4 @@ return:
ret i32 %retval.0
}
-attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr18846.ll b/llvm/test/CodeGen/X86/pr18846.ll
index 93a9a5d..4239f46 100644
--- a/llvm/test/CodeGen/X86/pr18846.ll
+++ b/llvm/test/CodeGen/X86/pr18846.ll
@@ -122,7 +122,7 @@ for.body65: ; preds = %for.body29
; Function Attrs: nounwind
declare void @llvm.x86.avx.storeu.ps.256(ptr, <8 x float>) #1
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll
index 4aa73d7..78ba7cc 100644
--- a/llvm/test/CodeGen/X86/pr31045.ll
+++ b/llvm/test/CodeGen/X86/pr31045.ll
@@ -73,4 +73,4 @@ entry:
ret void
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll
index dc11ba8..6f3602d 100644
--- a/llvm/test/CodeGen/X86/pr32610.ll
+++ b/llvm/test/CodeGen/X86/pr32610.ll
@@ -50,7 +50,7 @@ entry:
ret void
}
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index de34bfb1..279373a 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -132,4 +132,4 @@ define void @computeJD(ptr) nounwind {
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34080.ll b/llvm/test/CodeGen/X86/pr34080.ll
index d07d1aa..3b46bd3 100644
--- a/llvm/test/CodeGen/X86/pr34080.ll
+++ b/llvm/test/CodeGen/X86/pr34080.ll
@@ -162,4 +162,4 @@ entry:
ret void
}
-attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34629.ll b/llvm/test/CodeGen/X86/pr34629.ll
index eeb61d2..f7747b1 100644
--- a/llvm/test/CodeGen/X86/pr34629.ll
+++ b/llvm/test/CodeGen/X86/pr34629.ll
@@ -38,7 +38,7 @@ if.end: ; preds = %entry, %if.then
ret void
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr34634.ll b/llvm/test/CodeGen/X86/pr34634.ll
index a374112..980961a 100644
--- a/llvm/test/CodeGen/X86/pr34634.ll
+++ b/llvm/test/CodeGen/X86/pr34634.ll
@@ -54,7 +54,7 @@ entry:
ret i32 0
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll
index cf1fa5a..18e884b 100644
--- a/llvm/test/CodeGen/X86/pr42727.ll
+++ b/llvm/test/CodeGen/X86/pr42727.ll
@@ -29,5 +29,5 @@ entry:
ret void
}
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr48064.mir b/llvm/test/CodeGen/X86/pr48064.mir
index eb74edd..d2eaea7 100644
--- a/llvm/test/CodeGen/X86/pr48064.mir
+++ b/llvm/test/CodeGen/X86/pr48064.mir
@@ -185,8 +185,8 @@
; Function Attrs: nounwind
declare void @llvm.x86.seh.ehregnode(ptr) #7
- attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+ attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #2 = { argmemonly nofree nosync nounwind willreturn }
attributes #3 = { nofree }
attributes #4 = { nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index 0d5d822..e497575 100644
--- a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -178,4 +178,4 @@ bb439: ; preds = %bb222, %bb85
ret void
}
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index dab7a6a..f8d28ae 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1400,7 +1400,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
ret <16 x float> %div
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 77ccaff..7fa13cb 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -1841,8 +1841,8 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
ret <16 x float> %div
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #3 = { "reciprocal-estimates"="divf:0,vec-divf:0" }
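; Note: the "reciprocal-estimates" strings retained above follow LLVM's
; documented mini-syntax: a comma-separated list of estimate kinds, where a
; leading '!' disables that estimate, a trailing ':N' requests N refinement
; steps, and 'all' covers every kind. A minimal sketch of how such an
; attribute attaches to a function (hypothetical name and group number, not
; part of this patch):
define float @recip_sketch(float %x) #9 {
  %r = fdiv fast float 1.0, %x   ; division a backend may lower to a reciprocal estimate
  ret float %r
}
attributes #9 = { "reciprocal-estimates"="divf:1,!vec-divf" }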
diff --git a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
index 50422a8..ea1ca51 100644
--- a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
+++ b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -72,7 +72,7 @@ if.end: ; preds = %if.else, %if.then
ret i32 %add
}
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/regparm.ll b/llvm/test/CodeGen/X86/regparm.ll
index 6d6802e..95009b5 100644
--- a/llvm/test/CodeGen/X86/regparm.ll
+++ b/llvm/test/CodeGen/X86/regparm.ll
@@ -38,7 +38,7 @@ entry:
declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) #1
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index cb85f39..85e465b 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -189,9 +189,9 @@ entry:
; Function Attrs: nounwind
declare i32 @puts(ptr nocapture readonly) #3
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
attributes #4 = { noinline }
attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll
index 539d776..fedb0c4 100644
--- a/llvm/test/CodeGen/X86/seh-except-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-except-finally.ll
@@ -136,10 +136,10 @@ declare ptr @llvm.localaddress() #4
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.typeid.for(ptr) #4
-attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
attributes #4 = { nounwind readnone }
attributes #5 = { noinline }
attributes #6 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll
index 63e91d3..112031c 100644
--- a/llvm/test/CodeGen/X86/seh-no-invokes.ll
+++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll
@@ -63,8 +63,8 @@ declare i32 @_except_handler3(...)
; Function Attrs: nounwind
declare void @llvm.localescape(...) #3
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
index fe42d31..7e98b8a 100644
--- a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
+++ b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
@@ -29,4 +29,4 @@ if.end3: ; preds = %if.end
ret void
}
-attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index a260b32..83bfcd7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1012,15 +1012,15 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
ret double %sqrt_fast
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
+attributes #0 = { "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
-attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
-attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
-attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+attributes #3 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
+attributes #4 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "reciprocal-estimates"="all:0" }
+attributes #6 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
-; Attributes without "unsafe-fp-math"="true"
+; Attributes without "unsafe-fp-math" in the original test
; TODO: Merge with previous attributes when this attribute can be deleted.
attributes #7 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" } ; #3
attributes #8 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" } ; #6
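; Note: "denormal-fp-math" takes one or two comma-separated modes; when two
; are given, the first describes denormal results and the second denormal
; inputs, so "preserve-sign,ieee" flushes result denormals to a signed zero
; while reading input denormals as usual. Hypothetical illustration, not
; part of this patch:
;   attributes #10 = { "denormal-fp-math"="preserve-sign,dynamic" }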
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 8ac86d1..5005752 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -251,5 +251,5 @@ define <2 x float> @PR31672() #0 {
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
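; Note: instruction-level fast-math flags are the modern counterpart of the
; function-wide "unsafe-fp-math" attribute being dropped throughout this
; patch; the relaxation is spelled per operation instead. Hypothetical
; sketch, not part of this patch:
;   %p = fmul reassoc arcp float %a, %b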
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 665a84a..d7c9438 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1912,7 +1912,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1927,7 +1927,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1941,7 +1941,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1956,7 +1956,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1970,7 +1970,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1985,7 +1985,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1999,7 +1999,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2014,7 +2014,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2028,7 +2028,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2043,7 +2043,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2058,7 +2058,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2073,7 +2073,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2088,7 +2088,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2103,7 +2103,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2118,7 +2118,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2133,7 +2133,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2147,7 +2147,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2162,7 +2162,7 @@ define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2176,7 +2176,7 @@ define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2191,7 +2191,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2205,7 +2205,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2220,7 +2220,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2234,7 +2234,7 @@ define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2249,7 +2249,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2279,7 +2279,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2294,7 +2294,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2309,7 +2309,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3632,6 +3632,3 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
%6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
ret <8 x float> %6
}
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
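; Note: a trailing #N on a define binds the function to the numbered
; attribute group, and the .ll parser rejects references to groups that are
; never defined, so the #0/#1 suffixes on the defines above and the group
; definitions here are deleted together. Hypothetical sketch, not part of
; this patch:
;   define float @f(float %a) #5 {
;     ret float %a
;   }
;   attributes #5 = { nounwind }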
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index a75cdf9d..43743d5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -609,7 +609,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -695,7 +695,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
ret <8 x double> %4
}
-define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -781,7 +781,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x
ret <16 x float> %4
}
-define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -867,7 +867,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
ret <8 x double> %4
}
-define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1157,7 +1157,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %5
}
-define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1178,7 +1178,7 @@ define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
ret <8 x double> %6
}
-define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_orps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1414,7 +1414,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %5
}
-define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1435,7 +1435,7 @@ define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
ret <8 x double> %6
}
-define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_xorps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2058,5 +2058,4 @@ define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i3
ret <16 x float> %4
}
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index 52d4d8b..b715df8 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -502,7 +502,7 @@ define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, pt
ret <8 x half> %3
}
-define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -517,7 +517,7 @@ define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
}
declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
-define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -532,7 +532,7 @@ define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %
ret <32 x half> %2
}
-define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_maxph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -552,7 +552,7 @@ define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
ret <32 x half> %5
}
-define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -573,7 +573,7 @@ define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half>
ret <32 x half> %5
}
-define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_maxph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -590,7 +590,7 @@ define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
ret <32 x half> %4
}
-define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -710,7 +710,7 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
ret <32 x half> %4
}
-define half @stack_fold_maxsh(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_maxsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -725,7 +725,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
ret half %3
}
-define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh_commuted(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_maxsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -772,7 +772,7 @@ define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
ret half %3
}
-define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -820,7 +820,7 @@ define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %ma
ret <8 x half> %2
}
-define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -835,7 +835,7 @@ define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
}
declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
-define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_zmm_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -850,7 +850,7 @@ define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %
ret <32 x half> %2
}
-define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_minph_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -870,7 +870,7 @@ define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
ret <32 x half> %5
}
-define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -891,7 +891,7 @@ define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half>
ret <32 x half> %5
}
-define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_minph_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -908,7 +908,7 @@ define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
ret <32 x half> %4
}
-define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1028,7 +1028,7 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
ret <32 x half> %4
}
-define half @stack_fold_minsh(half %a0, half %a1) #0 {
+define half @stack_fold_minsh(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_minsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1043,7 +1043,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
ret half %3
}
-define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_minsh_commuted(half %a0, half %a1) {
; CHECK-LABEL: stack_fold_minsh_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1090,7 +1090,7 @@ define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
ret half %3
}
-define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minsh_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2316,5 +2316,4 @@ define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1,
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index 4fed6bc..cd06f2d 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -381,7 +381,7 @@ define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
ret <16 x half> %3
}
-define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -396,7 +396,7 @@ define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
}
declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone
-define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -410,7 +410,7 @@ define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #
ret <8 x half> %2
}
-define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -425,7 +425,7 @@ define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
}
declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone
-define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -439,7 +439,7 @@ define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half>
ret <16 x half> %2
}
-define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -454,7 +454,7 @@ define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
}
declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone
-define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -468,7 +468,7 @@ define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #
ret <8 x half> %2
}
-define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -483,7 +483,7 @@ define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
}
declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone
-define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_minph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1471,6 +1471,3 @@ define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a
ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index b370a80..bd56e61 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -457,7 +457,7 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
ret <4 x float> %2
}
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -472,7 +472,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -486,7 +486,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -501,7 +501,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -515,7 +515,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
ret <4 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -530,7 +530,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -544,7 +544,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -559,7 +559,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -573,7 +573,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
ret <8 x float> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -588,7 +588,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -602,7 +602,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -617,7 +617,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -687,7 +687,7 @@ define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %2
}
-define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -708,7 +708,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
ret <2 x double> %6
}
-define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -939,7 +939,7 @@ define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %2
}
-define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -960,7 +960,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
ret <2 x double> %6
}
-define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_xorpd_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1391,6 +1391,3 @@ declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 306ee31..9bc9a9c 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -1424,7 +1424,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1439,7 +1439,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1453,7 +1453,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1468,7 +1468,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1482,7 +1482,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1497,7 +1497,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_maxsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1512,7 +1512,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_maxsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1527,7 +1527,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1542,7 +1542,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_maxss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1557,7 +1557,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_maxss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1572,7 +1572,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1587,7 +1587,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minpd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1601,7 +1601,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
ret <2 x double> %2
}
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1616,7 +1616,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1630,7 +1630,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
ret <4 x float> %2
}
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1645,7 +1645,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
ret double %3
}
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_minsd_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1660,7 +1660,7 @@ define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
ret double %3
}
-define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_minsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1675,7 +1675,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1690,7 +1690,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
ret float %3
}
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_minss_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1705,7 +1705,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
ret float %3
}
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_minss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2490,6 +2490,3 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-protector-3.ll b/llvm/test/CodeGen/X86/stack-protector-3.ll
index 59784af..8ca6a56 100644
--- a/llvm/test/CodeGen/X86/stack-protector-3.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-3.ll
@@ -118,7 +118,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
-attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
index 63390e4..4bc91bf 100644
--- a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -55,6 +55,6 @@ entry:
declare void @f(i32) #1
-attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/stack_guard_remat.ll b/llvm/test/CodeGen/X86/stack_guard_remat.ll
index f7a602c..f53fa0b4 100644
--- a/llvm/test/CodeGen/X86/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/X86/stack_guard_remat.ll
@@ -23,4 +23,4 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/tail-merge-wineh.ll b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
index 00bddc1..a208368 100644
--- a/llvm/test/CodeGen/X86/tail-merge-wineh.ll
+++ b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
@@ -101,5 +101,5 @@ declare x86_stdcallcc void @_CxxThrowException(ptr, ptr)
declare i32 @__CxxFrameHandler3(...)
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { noreturn }
diff --git a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
index 2749ebd..f0d5a16 100644
--- a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
@@ -51,6 +51,6 @@ if.end: ; preds = %if.then, %entry
declare void @f(...) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/unused_stackslots.ll b/llvm/test/CodeGen/X86/unused_stackslots.ll
index d909dd4..4d390bd 100644
--- a/llvm/test/CodeGen/X86/unused_stackslots.ll
+++ b/llvm/test/CodeGen/X86/unused_stackslots.ll
@@ -215,8 +215,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64,
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/uwtables.ll b/llvm/test/CodeGen/X86/uwtables.ll
index 1e2e1d9..68a5ff1 100644
--- a/llvm/test/CodeGen/X86/uwtables.ll
+++ b/llvm/test/CodeGen/X86/uwtables.ll
@@ -38,5 +38,5 @@ declare i32 @__gxx_personality_v0(...)
declare void @__cxa_call_unexpected(ptr) local_unnamed_addr
-attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 910dd1e..5954e34 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -5435,7 +5435,7 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
ret double %r
}
-define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
+define void @PR43609(ptr nocapture %x, <2 x i64> %y) {
; SSE2-LABEL: PR43609:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
@@ -5643,6 +5643,3 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
store <2 x double> %t23, ptr %t26, align 8
ret void
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
-
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
index b08784a..843f099a 100644
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -63,6 +63,6 @@ entry:
; Function Attrs: nounwind readnone
declare float @sqrtf(float) local_unnamed_addr #1
-attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "target-features"="+avx2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "target-features"="+avx2" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
index 50c7b01..9363348 100644
--- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -85,8 +85,8 @@ entry:
; Function Attrs: argmemonly nounwind
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
!0 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e9265a1..59dcccc 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -194,6 +194,6 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64-NEXT: .long .Ltmp7@IMGREL
; X64-NEXT: .long -1
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
index 832ddca..0f51866 100644
--- a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
@@ -220,7 +220,7 @@ declare i32 @_except_handler3(...)
; Function Attrs: nounwind
declare void @llvm.localescape(...) #2
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
attributes #3 = { noinline }
diff --git a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
index 5b1f9b3..5095460 100644
--- a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
@@ -34,8 +34,8 @@ declare void @f(i32) #0
declare i32 @_except_handler3(...)
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
index 785c260..d4d4fe3 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
@@ -272,12 +272,12 @@ declare dso_local void @llvm.seh.try.end() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.exceptioncode(token) #3
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind willreturn }
attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #6 = { nounwind }
attributes #7 = { noreturn }
attributes #8 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
index 6c6e9c3..b0baaac 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
@@ -240,12 +240,12 @@ declare i32 @llvm.eh.exceptioncode(token) #1
declare dso_local void @"?printf@@YAXZZ"(...) #5
-attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #6 = { nounwind }
attributes #7 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 9e44299..d3da5f8 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -209,10 +209,10 @@ declare i32 @llvm.eh.exceptioncode(token) #4
; Function Attrs: nounwind
declare void @llvm.localescape(...) #5
-attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #2 = { nounwind willreturn }
-attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #4 = { nounwind readnone }
attributes #5 = { nounwind }
attributes #6 = { noinline }
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index cb12481..9f888f8 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -24,7 +24,7 @@ entry:
ret i64 %or
}
-attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
; clang -Os -c test2.cpp -emit-llvm -S
@@ -63,7 +63,7 @@ entry:
ret i64 %or
}
-attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
; clang -O2 -c test2.cpp -emit-llvm -S
; Verify that we do not generate a shld instruction when we are not optimizing
@@ -89,7 +89,7 @@ entry:
ret i64 %or
}
-attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
new file mode 100644
index 0000000..f6e941a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+;; Aggressive instcombine merges the two i8 stores into an i16 store. Check
+;; that the debug location and DIAssignID metadata get merged.
+
+; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] {
+; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]]
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]])
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]])
+; CHECK-NEXT: ret void
+
+; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]])
+
+define void @test_i16(i16 %x, ptr %p) !dbg !5 {
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17
+ #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18)
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20
+ #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18)
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 7}
+!3 = !{i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned)
+!16 = !DILocation(line: 2, column: 1, scope: !5)
+!17 = distinct !DIAssignID()
+!18 = !DILocation(line: 1, column: 1, scope: !5)
+!19 = !DILocation(line: 6, column: 1, scope: !5)
+!20 = distinct !DIAssignID()
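+
+;; For orientation (a sketch, not exercised by the checks above): on this
+;; little-endian target the two byte stores
+;;   store i8 %x.0, ptr %p       ; low byte  (fragment offset 0, size 8)
+;;   store i8 %x.1, ptr %gep.1   ; high byte (fragment offset 8, size 8)
+;; are folded into the single wider store
+;;   store i16 %x, ptr %p, align 1
+;; and the two #dbg_assign records are rewritten to extract their byte from
+;; the i16 value via DW_OP_LLVM_convert/DW_OP_shr expressions.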
diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll
index 7c2a753..7c2a753 100644
--- a/llvm/test/LTO/AArch64/TestInputs/bar.ll
+++ b/llvm/test/LTO/AArch64/Inputs/bar.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll
index e578426..e578426 100644
--- a/llvm/test/LTO/AArch64/TestInputs/fiz.ll
+++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll
index 689d938..689d938 100644
--- a/llvm/test/LTO/AArch64/TestInputs/foo.ll
+++ b/llvm/test/LTO/AArch64/Inputs/foo.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll
index 2b1758b..2b1758b 100644
--- a/llvm/test/LTO/AArch64/TestInputs/old.ll
+++ b/llvm/test/LTO/AArch64/Inputs/old.ll
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index aef8907..20254de 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -2,7 +2,7 @@
;; be mixed.
;;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -filetype=obj \
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
index df6276f..331e481 100644
--- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll
@@ -2,10 +2,10 @@
;; be mixed.
;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
-; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc
-; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc
-; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc
+; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc
+; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -exported-symbol foo_off \
diff --git a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
index cf8d21658..15efbee 100644
--- a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
@@ -16,7 +16,7 @@ fmmla v0.4s, v1.4s, v2.4s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
fmmla v0.8h, v1.8h, v2.8h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction requires: f16mm
// CHECK-NEXT: fmmla v0.8h, v1.8h, v2.8h
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..c25ff66
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
@@ -0,0 +1,191 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 2>&1 < %s | FileCheck %s
+
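+// Each case below expects exactly one diagnostic: [[@LINE-1]] pins the error
+// to the offending instruction's line, CHECK-NEXT matches the echoed
+// instruction, and the trailing CHECK-NOT guards against extra diagnostics.
+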
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+luti6 z0.h, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z0.h, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.s, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.s, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.d, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.d, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vectors/mismatched registers/invalid index
+
+luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers/index
+
+luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid registers
+
+luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid lookup table, expected zt0
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers
+
+luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector register expected
+// CHECK-NEXT: luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6.s b/llvm/test/MC/AArch64/SME2p3/luti6.s
new file mode 100644
index 0000000..7a7872f
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6.s
@@ -0,0 +1,472 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
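+// (The two sed commands strip the .text directive and everything up to the
+// "encoding: " annotation, leaving only the raw byte lists, which are then
+// fed back through the disassembler.)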
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (single)
+
+luti6 z0.b, zt0, z0
+// CHECK-INST: luti6 z0.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x00,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c84000 <unknown>
+
+luti6 z31.b, zt0, z0
+// CHECK-INST: luti6 z31.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x1f,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c8401f <unknown>
+
+luti6 z0.b, zt0, z31
+// CHECK-INST: luti6 z0.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xe0,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843e0 <unknown>
+
+luti6 z31.b, zt0, z31
+// CHECK-INST: luti6 z31.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xff,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843ff <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - consecutive
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f400 <unknown>
+
+luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x08,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f408 <unknown>
+
+luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x14,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f414 <unknown>
+
+luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x1c,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f41c <unknown>
+
+luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e0 <unknown>
+
+luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe8,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e8 <unknown>
+
+luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf4,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7f4 <unknown>
+
+luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xfc,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7fc <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - strided
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc00 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x01,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc01 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x02,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc02 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x03,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc03 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x10,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc10 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x11,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc11 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x12,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc12 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x13,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc13 <unknown>
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe0 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe1 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe2 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe3 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff0 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff1 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff2 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff3 <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - consecutive
+
+luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x08,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0008 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x14,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0014 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x1c,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a001c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0100 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x08,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0108 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x14,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0114 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x1c,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a011c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0280 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x88,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0288 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x94,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0294 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x9c,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a029c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0380 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x88,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0388 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x94,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0394 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x9c,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a039c <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - strided
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x01,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0001 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x02,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0002 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x03,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0003 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x10,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0010 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x11,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0011 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x12,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0012 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x13,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0013 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0100 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x01,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0101 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x02,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0102 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x03,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0103 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x10,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0110 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x11,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0111 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x12,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0112 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x13,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0113 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0280 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x81,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0281 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x82,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0282 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x83,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0283 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x90,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0290 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x91,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0291 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x92,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0292 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x93,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0293 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0380 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x81,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0381 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x82,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0382 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x83,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0383 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x90,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0390 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x91,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0391 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x92,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0392 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x93,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0393 <unknown>
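+
+// Note: no movprfx-prefixed cases appear above because every luti6 variant is
+// incompatible with movprfx; that behaviour is covered in luti6-diagnostics.s.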
diff --git a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
index 409c2c5..4695b05 100644
--- a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
@@ -5,11 +5,6 @@ bfmmla z0.s, z1.s, z2.h
// CHECK-NEXT: bfmmla z0.s, z1.s, z2.h
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-bfmmla z0.h, z1.h, z2.h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: bfmmla z0.h, z1.h, z2.h
-// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-
bfmmla z0.s, z1.h, z2.s
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
// CHECK-NEXT: bfmmla z0.s, z1.h, z2.s
diff --git a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
index 344abb1..42c6ae7 100644
--- a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
@@ -29,7 +29,7 @@ sdot z0.s, z0.h, z0.h[-1]
// Invalid vector suffix
sdot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: sdot z0.h, z0.s, z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
index 0debddf..aa2ec7f 100644
--- a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
@@ -29,11 +29,11 @@ udot z0.s, z0.h, z0.h[-1]
// Invalid vector suffix
udot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: udot z0.h, z0.s, z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
udot z0.h, z0.s, z0.s[1]
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// CHECK-NEXT: udot z0.h, z0.s, z0.s[1]
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
new file mode 100644
index 0000000..e88fcce
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
@@ -0,0 +1,15 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+fmmla z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: fmmla z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.d, p0/z, z6.d
+fmmla z0.h, z2.h, z3.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: fmmla z0.h, z2.h, z3.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla.s b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
new file mode 100644
index 0000000..19929a9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
@@ -0,0 +1,45 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p2,+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p2,-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2,+f16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla z0.h, z0.h, z0.h
+// CHECK-INST: fmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xa0,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a0e000 <unknown>
+
+fmmla z10.h, z10.h, z10.h
+// CHECK-INST: fmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xaa,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64aae14a <unknown>
+
+fmmla z21.h, z21.h, z21.h
+// CHECK-INST: fmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xb5,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64b5e2b5 <unknown>
+
+fmmla z31.h, z31.h, z31.h
+// CHECK-INST: fmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xbf,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64bfe3ff <unknown>
+
+movprfx z0, z7
+fmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: fmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xa2,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a2e020 <unknown>
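+
+// Note: fmmla only accepts the unpredicated movprfx shown above; a predicated
+// movprfx is rejected, as covered in fmmla-diagnostics.s.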
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
new file mode 100644
index 0000000..2c0cf07
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
@@ -0,0 +1,147 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Test addqp
+
+addqp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.h, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.h, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.s, p0/m, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.s, p0/m, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.d, p0/m, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.d, p0/m, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.b, p0/m, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.b, p0/m, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Predicate not in restricted predicate range
+
+subp z0.h, p8/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: subp z0.h, p8/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Operand must match destination register
+
+subp z0.b, p0/m, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must match destination register
+// CHECK-NEXT: subp z0.b, p0/m, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+addqp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addqp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+addsubp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addsubp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
new file mode 100644
index 0000000..12df18d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
@@ -0,0 +1,275 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+addqp z0.b, z0.b, z0.b
+// CHECK-INST: addqp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x78,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207800 <unknown>
+
+addqp z31.b, z31.b, z31.b
+// CHECK-INST: addqp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7bff <unknown>
+
+addqp z0.h, z0.h, z0.h
+// CHECK-INST: addqp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x78,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607800 <unknown>
+
+addqp z31.h, z31.h, z31.h
+// CHECK-INST: addqp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7bff <unknown>
+
+addqp z0.s, z0.s, z0.s
+// CHECK-INST: addqp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x78,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07800 <unknown>
+
+addqp z31.s, z31.s, z31.s
+// CHECK-INST: addqp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7bff <unknown>
+
+addqp z0.d, z0.d, z0.d
+// CHECK-INST: addqp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x78,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07800 <unknown>
+
+addqp z31.d, z31.d, z31.d
+// CHECK-INST: addqp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7bff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.b, z0.b, z0.b
+// CHECK-INST: addsubp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207c00 <unknown>
+
+addsubp z31.b, z31.b, z31.b
+// CHECK-INST: addsubp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7fff <unknown>
+
+addsubp z0.h, z0.h, z0.h
+// CHECK-INST: addsubp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607c00 <unknown>
+
+addsubp z31.h, z31.h, z31.h
+// CHECK-INST: addsubp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7fff <unknown>
+
+addsubp z0.s, z0.s, z0.s
+// CHECK-INST: addsubp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07c00 <unknown>
+
+addsubp z31.s, z31.s, z31.s
+// CHECK-INST: addsubp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7fff <unknown>
+
+addsubp z0.d, z0.d, z0.d
+// CHECK-INST: addsubp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07c00 <unknown>
+
+addsubp z31.d, z31.d, z31.d
+// CHECK-INST: addsubp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7fff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.h, z0.b, z0.b
+// CHECK-INST: sabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440d400 <unknown>
+
+sabal z31.h, z31.b, z31.b
+// CHECK-INST: sabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fd7ff <unknown>
+
+sabal z0.s, z0.h, z0.h
+// CHECK-INST: sabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480d400 <unknown>
+
+sabal z31.s, z31.h, z31.h
+// CHECK-INST: sabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fd7ff <unknown>
+
+sabal z0.d, z0.s, z0.s
+// CHECK-INST: sabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xd4,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0d400 <unknown>
+
+sabal z31.d, z31.s, z31.s
+// CHECK-INST: sabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xd7,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfd7ff <unknown>
+
+movprfx z0, z7
+sabal z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xd4,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442d420 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.h, z0.b, z0.b
+// CHECK-INST: uabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440dc00 <unknown>
+
+uabal z31.h, z31.b, z31.b
+// CHECK-INST: uabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fdfff <unknown>
+
+uabal z0.s, z0.h, z0.h
+// CHECK-INST: uabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480dc00 <unknown>
+
+uabal z31.s, z31.h, z31.h
+// CHECK-INST: uabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fdfff <unknown>
+
+uabal z0.d, z0.s, z0.s
+// CHECK-INST: uabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xdc,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0dc00 <unknown>
+
+uabal z31.d, z31.s, z31.s
+// CHECK-INST: uabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xdf,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfdfff <unknown>
+
+movprfx z0, z7
+uabal z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: uabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xdc,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442dc20 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.b, p0/m, z0.b, z0.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a000 <unknown>
+
+subp z31.b, p7/m, z31.b, z31.b
+// CHECK-INST: subp z31.b, p7/m, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410bfff <unknown>
+
+subp z0.h, p0/m, z0.h, z0.h
+// CHECK-INST: subp z0.h, p0/m, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450a000 <unknown>
+
+subp z31.h, p7/m, z31.h, z31.h
+// CHECK-INST: subp z31.h, p7/m, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450bfff <unknown>
+
+subp z0.s, p0/m, z0.s, z0.s
+// CHECK-INST: subp z0.s, p0/m, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490a000 <unknown>
+
+subp z31.s, p7/m, z31.s, z31.s
+// CHECK-INST: subp z31.s, p7/m, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490bfff <unknown>
+
+subp z0.d, p0/m, z0.d, z0.d
+// CHECK-INST: subp z0.d, p0/m, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0xa0,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0a000 <unknown>
+
+subp z31.d, p7/m, z31.d, z31.d
+// CHECK-INST: subp z31.d, p7/m, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0xbf,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0bfff <unknown>
+
+movprfx z0.b, p0/m, z7.b
+subp z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0.b, p0/m, z7.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
+
+movprfx z0, z7
+subp z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
new file mode 100644
index 0000000..28ec78d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm 2>&1 < %s| FileCheck %s
+
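+// In each case below, [[@LINE-1]] on the CHECK line expands to the line
+// number of the offending instruction, pinning the expected error to its
+// source line; CHECK-NEXT matches the echoed statement, and the trailing
+// CHECK-NOT guards against a stray second diagnostic before the next one.
+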
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+bfmmla z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
new file mode 100644
index 0000000..77440ee
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
@@ -0,0 +1,49 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve-b16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
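+// The sed pipeline keeps only the "encoding: [...]" byte lists printed by
+// -show-encoding (dropping the .text directive line), feeds them back through
+// the disassembler, and checks the re-printed output against the same
+// CHECK-ENCODING/CHECK-INST lines.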
+
+bfmmla z0.h, z0.h, z0.h
+// CHECK-INST: bfmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xe0,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e0e000 <unknown>
+
+bfmmla z10.h, z10.h, z10.h
+// CHECK-INST: bfmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xea,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64eae14a <unknown>
+
+bfmmla z21.h, z21.h, z21.h
+// CHECK-INST: bfmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xf5,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64f5e2b5 <unknown>
+
+bfmmla z31.h, z31.h, z31.h
+// CHECK-INST: bfmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xff,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64ffe3ff <unknown>
+
+movprfx z0, z7
+bfmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: bfmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xe2,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e2e020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
new file mode 100644
index 0000000..68a50ab
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
@@ -0,0 +1,193 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzun z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzun z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzun z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzun z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt.s b/llvm/test/MC/AArch64/SVE2p3/cvt.s
new file mode 100644
index 0000000..da9c463
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt.s
@@ -0,0 +1,321 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -----------------------------------------------------------------------
+// Floating-point convert, narrow and interleave to signed integer, rounding toward zero
+
+fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3000 <unknown>
+
+fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d301f <unknown>
+
+fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33c0 <unknown>
+
+fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33df <unknown>
+
+fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3000 <unknown>
+
+fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d301f <unknown>
+
+fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33c0 <unknown>
+
+fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33df <unknown>
+
+fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3000 <unknown>
+
+fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd301f <unknown>
+
+fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33c0 <unknown>
+
+fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33df <unknown>
+
+// -----------------------------------------------------------------------
+// Floating-point convert, narrow and interleave to unsigned integer, rounding toward zero
+
+fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3400 <unknown>
+
+fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d341f <unknown>
+
+fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37c0 <unknown>
+
+fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37df <unknown>
+
+fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3400 <unknown>
+
+fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d341f <unknown>
+
+fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37c0 <unknown>
+
+fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37df <unknown>
+
+fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3400 <unknown>
+
+fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd341f <unknown>
+
+fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37c0 <unknown>
+
+fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37df <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (bottom, unpredicated)
+
+scvtf z0.h, z0.b
+// CHECK-INST: scvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3000 <unknown>
+
+scvtf z31.h, z31.b
+// CHECK-INST: scvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x33,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c33ff <unknown>
+
+scvtf z0.s, z0.h
+// CHECK-INST: scvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3000 <unknown>
+
+scvtf z31.s, z31.h
+// CHECK-INST: scvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x33,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c33ff <unknown>
+
+scvtf z0.d, z0.s
+// CHECK-INST: scvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3000 <unknown>
+
+scvtf z31.d, z31.s
+// CHECK-INST: scvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x33,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc33ff <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (top, unpredicated)
+
+scvtflt z0.h, z0.b
+// CHECK-INST: scvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x38,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3800 <unknown>
+
+scvtflt z31.h, z31.b
+// CHECK-INST: scvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3bff <unknown>
+
+scvtflt z0.s, z0.h
+// CHECK-INST: scvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x38,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3800 <unknown>
+
+scvtflt z31.s, z31.h
+// CHECK-INST: scvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3bff <unknown>
+
+scvtflt z0.d, z0.s
+// CHECK-INST: scvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x38,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3800 <unknown>
+
+scvtflt z31.d, z31.s
+// CHECK-INST: scvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3b,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3bff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (bottom, unpredicated)
+
+ucvtf z0.h, z0.b
+// CHECK-INST: ucvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3400 <unknown>
+
+ucvtf z31.h, z31.b
+// CHECK-INST: ucvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x37,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c37ff <unknown>
+
+ucvtf z0.s, z0.h
+// CHECK-INST: ucvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3400 <unknown>
+
+ucvtf z31.s, z31.h
+// CHECK-INST: ucvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x37,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c37ff <unknown>
+
+ucvtf z0.d, z0.s
+// CHECK-INST: ucvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3400 <unknown>
+
+ucvtf z31.d, z31.s
+// CHECK-INST: ucvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x37,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc37ff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (top, unpredicated)
+
+ucvtflt z0.h, z0.b
+// CHECK-INST: ucvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3c00 <unknown>
+
+ucvtflt z31.h, z31.b
+// CHECK-INST: ucvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3fff <unknown>
+
+ucvtflt z0.s, z0.h
+// CHECK-INST: ucvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3c00 <unknown>
+
+ucvtflt z31.s, z31.h
+// CHECK-INST: ucvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3fff <unknown>
+
+ucvtflt z0.d, z0.s
+// CHECK-INST: ucvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x3c,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3c00 <unknown>
+
+ucvtflt z31.d, z31.s
+// CHECK-INST: ucvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3f,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
new file mode 100644
index 0000000..0a12cf8
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
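+// Enable sve2p3 via .arch, then strip it again with the no-form; the
+// instruction that follows must be rejected once the feature is disabled.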
+.arch armv9-a+sve2p3
+.arch armv9-a+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
new file mode 100644
index 0000000..1af6245
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch_extension sve2p3
+.arch_extension nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
new file mode 100644
index 0000000..f3dac04
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.cpu generic+sve2p3
+.cpu generic+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
new file mode 100644
index 0000000..3eb3792
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
@@ -0,0 +1,137 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sdot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid register range and index
+
+sdot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sdot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: udot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot.s b/llvm/test/MC/AArch64/SVE2p3/dot.s
new file mode 100644
index 0000000..01021b38
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot.s
@@ -0,0 +1,173 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+sdot z0.h, z0.b, z0.b
+// CHECK-INST: sdot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x00,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400000 <unknown>
+
+sdot z10.h, z10.b, z10.b
+// CHECK-INST: sdot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x01,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a014a <unknown>
+
+sdot z21.h, z21.b, z21.b
+// CHECK-INST: sdot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x02,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445502b5 <unknown>
+
+sdot z31.h, z31.b, z31.b
+// CHECK-INST: sdot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x03,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x00,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420020 <unknown>
+
+// sdot indexed
+
+sdot z0.h, z0.b, z0.b[0]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200000 <unknown>
+
+sdot z31.h, z31.b, z7.b[0]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442703ff <unknown>
+
+sdot z0.h, z0.b, z0.b[1]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280000 <unknown>
+
+sdot z31.h, z31.b, z7.b[1]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f03ff <unknown>
+
+sdot z0.h, z0.b, z0.b[7]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780000 <unknown>
+
+sdot z31.h, z31.b, z7.b[7]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x00,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220020 <unknown>
+
+// udot
+
+udot z0.h, z0.b, z0.b
+// CHECK-INST: udot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x04,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400400 <unknown>
+
+udot z10.h, z10.b, z10.b
+// CHECK-INST: udot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x05,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a054a <unknown>
+
+udot z21.h, z21.b, z21.b
+// CHECK-INST: udot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x06,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445506b5 <unknown>
+
+udot z31.h, z31.b, z31.b
+// CHECK-INST: udot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x07,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x04,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420420 <unknown>
+
+// udot indexed
+
+udot z0.h, z0.b, z0.b[0]
+// CHECK-INST: udot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200400 <unknown>
+
+udot z31.h, z31.b, z7.b[0]
+// CHECK-INST: udot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442707ff <unknown>
+
+udot z0.h, z0.b, z0.b[1]
+// CHECK-INST: udot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280400 <unknown>
+
+udot z31.h, z31.b, z7.b[1]
+// CHECK-INST: udot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f07ff <unknown>
+
+udot z0.h, z0.b, z0.b[7]
+// CHECK-INST: udot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780400 <unknown>
+
+udot z31.h, z31.b, z7.b[7]
+// CHECK-INST: udot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x04,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220420 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..21b4df9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
@@ -0,0 +1,70 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width or operand
+
+luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width or operand
+
+luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6.s b/llvm/test/MC/AArch64/SVE2p3/luti6.s
new file mode 100644
index 0000000..848091c
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6.s
@@ -0,0 +1,115 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit)
+
+luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x00,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac00 <unknown>
+
+luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac0a <unknown>
+
+luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x15,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac15 <unknown>
+
+luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac1f <unknown>
+
+luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafe0 <unknown>
+
+luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xea,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafea <unknown>
+
+luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453faff5 <unknown>
+
+luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xff,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafff <unknown>
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit)
+
+luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x00,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac00 <unknown>
+
+luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac0a <unknown>
+
+luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x15,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac15 <unknown>
+
+luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac1f <unknown>
+
+luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafe0 <unknown>
+
+luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xea,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafea <unknown>
+
+luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffaff5 <unknown>
+
+luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xff,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
new file mode 100644
index 0000000..a4dbae2
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
@@ -0,0 +1,346 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
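+// The legal shift range follows the source element width: [1, 8] when
+// narrowing from .h to .b and [1, 16] when narrowing from .s to .h.
+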
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
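+// --------------------------------------------------------------------------//
+// Invalid vector list
+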
+sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
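+// --------------------------------------------------------------------------//
+// Invalid vector list
+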
+uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Immediate out of range
+
+uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
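+// --------------------------------------------------------------------------//
+// Invalid vector list
+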
+uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn.s b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
new file mode 100644
index 0000000..31c87cf
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
@@ -0,0 +1,262 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN: | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right narrow by immediate and interleave
+
+sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x28,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2bdf <unknown>
+
+sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x28,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xad,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ad2bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right unsigned narrow by immediate and interleave
+
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x08,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0bdf <unknown>
+
+sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x08,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating shift right narrow by immediate and interleave
+
+sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xd5,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03d5 <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03df <unknown>
+
+sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x0a,0x00,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a8000a <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a803df <unknown>
+
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0000 <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf0000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf03df <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x00,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b00000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b003df <unknown>
+
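+// Note: the following is an observation from the encodings above, not a
+// quote from the architecture text. The shift amount is stored inverted:
+// the third encoding byte holds (2 * esize - shift) in its low bits, where
+// esize is the destination element size. For example, sqshrn .b #1 encodes
+// 0xaf (16 - 1 = 15) and #8 encodes 0xa8 (16 - 8 = 8); .h #1 encodes 0xbf
+// (32 - 1 = 31) and #16 encodes 0xb0 (32 - 16 = 16).
+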
+// -----------------------------------------------------------------
+// Signed saturating shift right unsigned narrow by immediate and interleave
+
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af23df <unknown>
+
+sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x20,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a823df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf2000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf23df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x20,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b02000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b023df <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating rounding shift right narrow by immediate and interleave
+
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x38,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3bdf <unknown>
+
+uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x38,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83bdf <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating shift right narrow by immediate and interleave
+
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af1000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af13df <unknown>
+
+uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x10,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a81000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a813df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf1000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf13df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x10,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b01000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b013df <unknown>
diff --git a/llvm/test/MC/AArch64/armv8.4a-mpam.s b/llvm/test/MC/AArch64/armv8.4a-mpam.s
index 14787e6..7469227 100644
--- a/llvm/test/MC/AArch64/armv8.4a-mpam.s
+++ b/llvm/test/MC/AArch64/armv8.4a-mpam.s
@@ -1,6 +1,4 @@
// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-RO < %t %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
//------------------------------------------------------------------------------
// ARMV8.4-A MPAM Extensions
@@ -56,9 +54,6 @@ mrs x0, MPAMIDR_EL1
//CHECK: msr MPAMVPM6_EL2, x0 // encoding: [0xc0,0xa6,0x1c,0xd5]
//CHECK: msr MPAMVPM7_EL2, x0 // encoding: [0xe0,0xa6,0x1c,0xd5]
-//CHECK-RO: error: expected writable system register or pstate
-//CHECK-RO: msr MPAMIDR_EL1, x0
-//CHECK-RO: ^
//CHECK: mrs x0, MPAM0_EL1 // encoding: [0x20,0xa5,0x38,0xd5]
//CHECK: mrs x0, MPAM1_EL1 // encoding: [0x00,0xa5,0x38,0xd5]
@@ -77,100 +72,4 @@ mrs x0, MPAMIDR_EL1
//CHECK: mrs x0, MPAMVPM7_EL2 // encoding: [0xe0,0xa6,0x3c,0xd5]
//CHECK: mrs x0, MPAMIDR_EL1 // encoding: [0x80,0xa4,0x38,0xd5]
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM0_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM2_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM3_EL3, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL12, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMHCR_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPMV_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM0_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM1_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM2_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM3_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM4_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM5_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM6_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM7_EL2, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMIDR_EL1, x0
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM0_EL1
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL1
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM2_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM3_EL3
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL12
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMHCR_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPMV_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM0_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM1_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM2_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM3_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM4_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM5_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM6_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM7_EL2
-//CHECK-ERROR: ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMIDR_EL1
-//CHECK-ERROR: ^
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
new file mode 100644
index 0000000..cffee7d
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+gcie -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// FEAT_GCIE instructions
+//------------------------------------------------------------------------------
+
+gsb
+// CHECK-ERROR: error: invalid operand for GSB instruction
+
+gicr
+// CHECK-ERROR: error: expected register operand
+
+gicr x3, foo
+// CHECK-ERROR: error: invalid operand for GICR instruction
+
+gic cdaff
+// CHECK-ERROR: error: specified gic op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie.s b/llvm/test/MC/AArch64/armv9.7a-gcie.s
new file mode 100644
index 0000000..4fd5d25
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie.s
@@ -0,0 +1,987 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN: | llvm-objdump -d --mattr=+gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN: | llvm-objdump -d --mattr=-gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+gcie -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_GCIE Extensions
+//------------------------------------------------------------------------------
+
+// CPU Interface Registers MRS Instructions - Encodings Checked
+MRS x3, ICC_APR_EL1
+// CHECK-INST: mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICC_APR_EL3
+// CHECK-INST: mrs x3, ICC_APR_EL3
+// CHECK-ENCODING: [0x03,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec803
+
+MRS x3, ICC_CR0_EL1
+// CHECK-INST: mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICC_CR0_EL3
+// CHECK-INST: mrs x3, ICC_CR0_EL3
+// CHECK-ENCODING: [0x03,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec903
+
+MRS x3, ICC_DOMHPPIR_EL3
+// CHECK-INST: mrs x3, ICC_DOMHPPIR_EL3
+// CHECK-ENCODING: [0x43,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec843
+
+MRS x3, ICC_HAPR_EL1
+// CHECK-INST: mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICC_HPPIR_EL1
+// CHECK-INST: mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICC_HPPIR_EL3
+// CHECK-INST: mrs x3, ICC_HPPIR_EL3
+// CHECK-ENCODING: [0x23,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec923
+
+MRS x3, ICC_IAFFIDR_EL1
+// CHECK-INST: mrs x3, ICC_IAFFIDR_EL1
+// CHECK-ENCODING: [0xa3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538caa3
+
+MRS x3, ICC_ICSR_EL1
+// CHECK-INST: mrs x3, ICC_ICSR_EL1
+// CHECK-ENCODING: [0x83,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca83
+
+MRS x3, ICC_IDR0_EL1
+// CHECK-INST: mrs x3, ICC_IDR0_EL1
+// CHECK-ENCODING: [0x43,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca43
+
+MRS x3, ICC_PCR_EL1
+// CHECK-INST: mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+MRS x3, ICC_PCR_EL3
+// CHECK-INST: mrs x3, ICC_PCR_EL3
+// CHECK-ENCODING: [0x23,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec823
+
+MRS x3, ICC_SRE_EL1
+// CHECK-INST: mrs x3, ICC_SRE_EL1
+// CHECK-ENCODING: [0xa3,0xcc,0x38,0xd5]
+// CHECK-UNKNOWN: d538cca3
+
+// -----------------------------------------------
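+// CPU Interface Registers MSR Instructions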
+MSR ICC_APR_EL1, x3
+// CHECK-INST: msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICC_APR_EL3, x3
+// CHECK-INST: msr ICC_APR_EL3, x3
+// CHECK-ENCODING: [0x03,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec803
+
+MSR ICC_CR0_EL1, x3
+// CHECK-INST: msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICC_CR0_EL3, x3
+// CHECK-INST: msr ICC_CR0_EL3, x3
+// CHECK-ENCODING: [0x03,0xc9,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec903
+
+MSR ICC_ICSR_EL1, x3
+// CHECK-INST: msr ICC_ICSR_EL1, x3
+// CHECK-ENCODING: [0x83,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518ca83
+
+MSR ICC_PCR_EL1, x3
+// CHECK-INST: msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+MSR ICC_PCR_EL3, x3
+// CHECK-INST: msr ICC_PCR_EL3, x3
+// CHECK-ENCODING: [0x23,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec823
+
+// -----------------------------------------------
+// Virtual CPU Registers MRS Instructions
+
+// The specification says:
+// "Each ICC_system register that is accessible at EL1 and higher and whose state
+// is specific to the Virtual Interrupt Domain, has a corresponding virtual
+// ICV_register. The ICV_registers are accessed using the same system register
+// encodings as their ICC_counterparts."
+//
+// So expect ICC_* encodings here, not ICV_* encodings
+
+MRS x3, ICV_APR_EL1
+// CHECK-INST: mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICV_CR0_EL1
+// CHECK-INST: mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICV_HAPR_EL1
+// CHECK-INST: mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICV_HPPIR_EL1
+// CHECK-INST: mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICV_PCR_EL1
+// CHECK-INST: mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+
+// -----------------------------------------------
+// Likewise for MSR, expect ICC_* encodings here, not ICV_* encodings
+MSR ICV_APR_EL1, x3
+// CHECK-INST: msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICV_CR0_EL1, x3
+// CHECK-INST: msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICV_PCR_EL1, x3
+// CHECK-INST: msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+// -----------------------------------------------
+// PPI Registers MRS Instructions - Encodings Checked
+MRS x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-ENCODING: [0x03,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd03
+
+MRS x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-ENCODING: [0x23,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd23
+
+MRS x3, ICC_PPI_CPENDR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CPENDR0_EL1
+// CHECK-ENCODING: [0x83,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd83
+
+MRS x3, ICC_PPI_CPENDR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_CPENDR1_EL1
+// CHECK-ENCODING: [0xa3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cda3
+
+MRS x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-ENCODING: [0x83,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec883
+
+MRS x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-ENCODING: [0xa3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8a3
+
+MRS x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-ENCODING: [0xc3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8c3
+
+MRS x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-INST: mrs x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-ENCODING: [0xe3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8e3
+
+MRS x3, ICC_PPI_ENABLER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_ENABLER0_EL1
+// CHECK-ENCODING: [0xc3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cac3
+
+MRS x3, ICC_PPI_ENABLER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_ENABLER1_EL1
+// CHECK-ENCODING: [0xe3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cae3
+
+MRS x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-ENCODING: [0x03,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce03
+
+MRS x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-ENCODING: [0x23,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce23
+
+MRS x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-ENCODING: [0x43,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce43
+
+MRS x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-ENCODING: [0x63,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce63
+
+MRS x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-ENCODING: [0x83,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce83
+
+MRS x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-ENCODING: [0xa3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cea3
+
+MRS x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-ENCODING: [0xc3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cec3
+
+MRS x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-ENCODING: [0xe3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cee3
+
+MRS x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-ENCODING: [0x03,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf03
+
+MRS x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-ENCODING: [0x23,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf23
+
+MRS x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-ENCODING: [0x43,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf43
+
+MRS x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-ENCODING: [0x63,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf63
+
+MRS x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-ENCODING: [0x83,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf83
+
+MRS x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-ENCODING: [0xa3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfa3
+
+MRS x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-ENCODING: [0xc3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfc3
+
+MRS x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-INST: mrs x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-ENCODING: [0xe3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfe3
+
+MRS x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-ENCODING: [0x43,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd43
+
+MRS x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-ENCODING: [0x63,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd63
+
+MRS x3, ICC_PPI_SPENDR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SPENDR0_EL1
+// CHECK-ENCODING: [0xc3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cdc3
+
+MRS x3, ICC_PPI_SPENDR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_SPENDR1_EL1
+// CHECK-ENCODING: [0xe3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cde3
+
+MRS x3, ICC_PPI_HMR0_EL1
+// CHECK-INST: mrs x3, ICC_PPI_HMR0_EL1
+// CHECK-ENCODING: [0x03,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca03
+
+MRS x3, ICC_PPI_HMR1_EL1
+// CHECK-INST: mrs x3, ICC_PPI_HMR1_EL1
+// CHECK-ENCODING: [0x23,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca23
+
+// -----------------------------------------------
+// PPI Registers MSR Instructions
+MSR ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x03,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd03
+
+MSR ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x23,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd23
+
+MSR ICC_PPI_CPENDR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_CPENDR0_EL1, x3
+// CHECK-ENCODING: [0x83,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd83
+
+MSR ICC_PPI_CPENDR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_CPENDR1_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cda3
+
+MSR ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec883
+
+MSR ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-ENCODING: [0xa3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8a3
+
+MSR ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-ENCODING: [0xc3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8c3
+
+MSR ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-INST: msr ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-ENCODING: [0xe3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8e3
+
+MSR ICC_PPI_ENABLER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_ENABLER0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cac3
+
+MSR ICC_PPI_ENABLER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_ENABLER1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cae3
+
+MSR ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-ENCODING: [0x03,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce03
+
+MSR ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-ENCODING: [0x23,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce23
+
+MSR ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-ENCODING: [0x43,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce43
+
+MSR ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-ENCODING: [0x63,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce63
+
+MSR ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-ENCODING: [0x83,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce83
+
+MSR ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-ENCODING: [0xa3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cea3
+
+MSR ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-ENCODING: [0xc3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cec3
+
+MSR ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-ENCODING: [0xe3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cee3
+
+MSR ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-ENCODING: [0x03,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf03
+
+MSR ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-ENCODING: [0x23,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf23
+
+MSR ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-ENCODING: [0x43,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf43
+
+MSR ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-ENCODING: [0x63,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf63
+
+MSR ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-ENCODING: [0x83,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf83
+
+MSR ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfa3
+
+MSR ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfc3
+
+MSR ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-INST: msr ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfe3
+
+MSR ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-INST: msr ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x43,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd43
+
+MSR ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-INST: msr ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x63,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd63
+
+MSR ICC_PPI_SPENDR0_EL1, x3
+// CHECK-INST: msr ICC_PPI_SPENDR0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cdc3
+
+MSR ICC_PPI_SPENDR1_EL1, x3
+// CHECK-INST: msr ICC_PPI_SPENDR1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cde3
+
+// -----------------------------------------------
+// Hypervisor Control Register MRS Instructions
+MRS x3, ICH_APR_EL2
+// CHECK-INST: mrs x3, ICH_APR_EL2
+// CHECK-ENCODING: [0x83,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc883
+
+MRS x3, ICH_CONTEXTR_EL2
+// CHECK-INST: mrs x3, ICH_CONTEXTR_EL2
+// CHECK-ENCODING: [0xc3,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccbc3
+
+MRS x3, ICH_HFGITR_EL2
+// CHECK-INST: mrs x3, ICH_HFGITR_EL2
+// CHECK-ENCODING: [0xe3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9e3
+
+MRS x3, ICH_HFGRTR_EL2
+// CHECK-INST: mrs x3, ICH_HFGRTR_EL2
+// CHECK-ENCODING: [0x83,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc983
+
+MRS x3, ICH_HFGWTR_EL2
+// CHECK-INST: mrs x3, ICH_HFGWTR_EL2
+// CHECK-ENCODING: [0xc3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9c3
+
+MRS x3, ICH_HPPIR_EL2
+// CHECK-INST: mrs x3, ICH_HPPIR_EL2
+// CHECK-ENCODING: [0xa3,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc8a3
+
+MRS x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-ENCODING: [0xc3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccac3
+
+MRS x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-ENCODING: [0xe3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccae3
+
+MRS x3, ICH_PPI_DVIR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_DVIR0_EL2
+// CHECK-ENCODING: [0x03,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca03
+
+MRS x3, ICH_PPI_DVIR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_DVIR1_EL2
+// CHECK-ENCODING: [0x23,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca23
+
+MRS x3, ICH_PPI_ENABLER0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ENABLER0_EL2
+// CHECK-ENCODING: [0x43,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca43
+
+MRS x3, ICH_PPI_ENABLER1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_ENABLER1_EL2
+// CHECK-ENCODING: [0x63,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca63
+
+MRS x3, ICH_PPI_PENDR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PENDR0_EL2
+// CHECK-ENCODING: [0x83,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca83
+
+MRS x3, ICH_PPI_PENDR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PENDR1_EL2
+// CHECK-ENCODING: [0xa3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccaa3
+
+MRS x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-ENCODING: [0x03,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce03
+
+MRS x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-ENCODING: [0x23,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce23
+
+MRS x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-ENCODING: [0x43,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce43
+
+MRS x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-ENCODING: [0x63,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce63
+
+MRS x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-ENCODING: [0x83,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce83
+
+MRS x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-ENCODING: [0xa3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccea3
+
+MRS x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-ENCODING: [0xc3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccec3
+
+MRS x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-ENCODING: [0xe3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccee3
+
+MRS x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-ENCODING: [0x03,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf03
+
+MRS x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-ENCODING: [0x23,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf23
+
+MRS x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-ENCODING: [0x43,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf43
+
+MRS x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-ENCODING: [0x63,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf63
+
+MRS x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-ENCODING: [0x83,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf83
+
+MRS x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-ENCODING: [0xa3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfa3
+
+MRS x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-ENCODING: [0xc3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfc3
+
+MRS x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-INST: mrs x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-ENCODING: [0xe3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfe3
+
+MRS x3, ICH_VCTLR_EL2
+// CHECK-INST: mrs x3, ICH_VCTLR_EL2
+// CHECK-ENCODING: [0x83,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccb83
+
+// -----------------------------------------------
+// Hypervisor Control Register MSR Instructions
+MSR ICH_APR_EL2, x3
+// CHECK-INST: msr ICH_APR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc883
+
+MSR ICH_CONTEXTR_EL2, x3
+// CHECK-INST: msr ICH_CONTEXTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccbc3
+
+MSR ICH_HFGITR_EL2, x3
+// CHECK-INST: msr ICH_HFGITR_EL2, x3
+// CHECK-ENCODING: [0xe3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9e3
+
+MSR ICH_HFGRTR_EL2, x3
+// CHECK-INST: msr ICH_HFGRTR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc983
+
+MSR ICH_HFGWTR_EL2, x3
+// CHECK-INST: msr ICH_HFGWTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9c3
+
+MSR ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-INST: msr ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-ENCODING: [0xc3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccac3
+
+MSR ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-INST: msr ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-ENCODING: [0xe3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccae3
+
+MSR ICH_PPI_DVIR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_DVIR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca03
+
+MSR ICH_PPI_DVIR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_DVIR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca23
+
+MSR ICH_PPI_ENABLER0_EL2, x3
+// CHECK-INST: msr ICH_PPI_ENABLER0_EL2, x3
+// CHECK-ENCODING: [0x43,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca43
+
+MSR ICH_PPI_ENABLER1_EL2, x3
+// CHECK-INST: msr ICH_PPI_ENABLER1_EL2, x3
+// CHECK-ENCODING: [0x63,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca63
+
+MSR ICH_PPI_PENDR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_PENDR0_EL2, x3
+// CHECK-ENCODING: [0x83,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca83
+
+MSR ICH_PPI_PENDR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_PENDR1_EL2, x3
+// CHECK-ENCODING: [0xa3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccaa3
+
+MSR ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce03
+
+MSR ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce23
+
+MSR ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-ENCODING: [0x43,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce43
+
+MSR ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-ENCODING: [0x63,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce63
+
+MSR ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-ENCODING: [0x83,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce83
+
+MSR ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-ENCODING: [0xa3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccea3
+
+MSR ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-ENCODING: [0xc3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccec3
+
+MSR ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-ENCODING: [0xe3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccee3
+
+MSR ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-ENCODING: [0x03,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf03
+
+MSR ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-ENCODING: [0x23,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf23
+
+MSR ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-ENCODING: [0x43,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf43
+
+MSR ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-ENCODING: [0x63,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf63
+
+MSR ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-ENCODING: [0x83,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf83
+
+MSR ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfa3
+
+MSR ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfc3
+
+MSR ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-INST: msr ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfe3
+
+MSR ICH_VCTLR_EL2, x3
+// CHECK-INST: msr ICH_VCTLR_EL2, x3
+// CHECK-ENCODING: [0x83,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccb83
+
+// -----------------------------------------------
+// FEAT_GCIE Instructions
+// Current Interrupt Domain
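+// Without the gcie feature the disassembler decodes these to the generic
+// SYS/SYSL forms shown on the CHECK-UNKNOWN lines below.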
+GIC CDAFF, x3
+// CHECK-INST: gic cdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c163 sys #0, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC cdaff requires: gcie
+
+GIC CDDI, x3
+// CHECK-INST: gic cddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c203 sys #0, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC cddi requires: gcie
+
+GIC CDDIS, x3
+// CHECK-INST: gic cddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c103 sys #0, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC cddis requires: gcie
+
+GIC CDEN, x3
+// CHECK-INST: gic cden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c123 sys #0, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC cden requires: gcie
+
+GIC CDEOI, x3
+// CHECK-INST: gic cdeoi, x3
+// CHECK-ENCODING: [0xe3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1e3 sys #0, c12, c1, #7, x3
+// CHECK-ERROR: error: GIC cdeoi requires: gcie
+
+GIC CDHM, x3
+// CHECK-INST: gic cdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c223 sys #0, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC cdhm requires: gcie
+
+GIC CDPEND, x3
+// CHECK-INST: gic cdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c183 sys #0, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC cdpend requires: gcie
+
+GIC CDPRI, x3
+// CHECK-INST: gic cdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c143 sys #0, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC cdpri requires: gcie
+
+GIC CDRCFG, x3
+// CHECK-INST: gic cdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1a3 sys #0, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC cdrcfg requires: gcie
+
+GICR x3, CDIA
+// CHECK-INST: gicr x3, cdia
+// CHECK-ENCODING: [0x03,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c303 sysl x3, #0, c12, c3, #0
+// CHECK-ERROR: error: GICR cdia requires: gcie
+
+GICR x3, CDNMIA
+// CHECK-INST: gicr x3, cdnmia
+// CHECK-ENCODING: [0x23,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c323 sysl x3, #0, c12, c3, #1
+// CHECK-ERROR: error: GICR cdnmia requires: gcie
+
+// -----------------------------------------------
+// Virtual Interrupt Domain
+GIC VDAFF, x3
+// CHECK-INST: gic vdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc163 sys #4, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC vdaff requires: gcie
+
+GIC VDDI, x3
+// CHECK-INST: gic vddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc203 sys #4, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC vddi requires: gcie
+
+GIC VDDIS, x3
+// CHECK-INST: gic vddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc103 sys #4, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC vddis requires: gcie
+
+GIC VDEN, x3
+// CHECK-INST: gic vden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc123 sys #4, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC vden requires: gcie
+
+GIC VDHM, x3
+// CHECK-INST: gic vdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc223 sys #4, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC vdhm requires: gcie
+
+GIC VDPEND, x3
+// CHECK-INST: gic vdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc183 sys #4, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC vdpend requires: gcie
+
+GIC VDPRI, x3
+// CHECK-INST: gic vdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc143 sys #4, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC vdpri requires: gcie
+
+GIC VDRCFG, x3
+// CHECK-INST: gic vdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc1a3 sys #4, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC vdrcfg requires: gcie
+
+// -----------------------------------------------
+// Logical Interrupt Domain
+GIC LDAFF, x3
+// CHECK-INST: gic ldaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec163 sys #6, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC ldaff requires: gcie
+
+GIC LDDI, x3
+// CHECK-INST: gic lddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec203 sys #6, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC lddi requires: gcie
+
+GIC LDDIS, x3
+// CHECK-INST: gic lddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec103 sys #6, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC lddis requires: gcie
+
+GIC LDEN, x3
+// CHECK-INST: gic lden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec123 sys #6, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC lden requires: gcie
+
+GIC LDHM, x3
+// CHECK-INST: gic ldhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec223 sys #6, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC ldhm requires: gcie
+
+GIC LDPEND, x3
+// CHECK-INST: gic ldpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec183 sys #6, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC ldpend requires: gcie
+
+GIC LDPRI, x3
+// CHECK-INST: gic ldpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec143 sys #6, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC ldpri requires: gcie
+
+GIC LDRCFG, x3
+// CHECK-INST: gic ldrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec1a3 sys #6, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC ldrcfg requires: gcie
+
+// -----------------------------------------------
+// GIC Synchronization Barrier Instructions
+GSB SYS
+// CHECK-INST: gsb sys
+// CHECK-ENCODING: [0x1f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c01f sys #0, c12, c0, #0
+// CHECK-ERROR: error: GSB sys requires: gcie
+
+GSB ACK
+// CHECK-INST: gsb ack
+// CHECK-ENCODING: [0x3f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c03f sys #0, c12, c0, #1
+// CHECK-ERROR: error: GSB ack requires: gcie
diff --git a/llvm/test/MC/AArch64/armv9.7a-memsys.s b/llvm/test/MC/AArch64/armv9.7a-memsys.s
new file mode 100644
index 0000000..228c71e
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-memsys.s
@@ -0,0 +1,141 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN: | llvm-objdump -d --mattr=+cmh,+lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN: | llvm-objdump -d --mattr=-cmh,-lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+cmh,+lscp -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A Contention Management Hints (FEAT_CMH).
+
+shuh
+// CHECK-INST: shuh
+// CHECK-ENCODING: encoding: [0x5f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503265f hint #50
+
+shuh ph
+// CHECK-INST: shuh ph
+// CHECK-ENCODING: encoding: [0x7f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503267f hint #51
+
+stcph
+// CHECK-INST: stcph
+// CHECK-ENCODING: encoding: [0x9f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503269f hint #52
+
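+// Armv9.7-A FEAT_LSCP paired load/store instructions.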
+ldap x0, x1, [x2]
+// CHECK-INST: ldap x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldap x0, x1, [x2, #0]
+// CHECK-INST: ldap x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldapp x0, x1, [x2]
+// CHECK-INST: ldapp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+ldapp x0, x1, [x2, #0]
+// CHECK-INST: ldapp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+stlp x0, x1, [x2, #0]
+// CHECK-INST: stlp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
+stlp x0, x1, [x2]
+// CHECK-INST: stlp x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
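+// TLBID system registers (VTLBID<n>_EL2, VTLBIDOS<n>_EL2, TLBIDIDR_EL1).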
+mrs x3, VTLBID0_EL2
+// CHECK-INST: mrs x3, VTLBID0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2803
+mrs x3, VTLBID1_EL2
+// CHECK-INST: mrs x3, VTLBID1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2823
+mrs x3, VTLBID2_EL2
+// CHECK-INST: mrs x3, VTLBID2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2843
+mrs x3, VTLBID3_EL2
+// CHECK-INST: mrs x3, VTLBID3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2863
+mrs x3, VTLBIDOS0_EL2
+// CHECK-INST: mrs x3, VTLBIDOS0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2903
+mrs x3, VTLBIDOS1_EL2
+// CHECK-INST: mrs x3, VTLBIDOS1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2923
+mrs x3, VTLBIDOS2_EL2
+// CHECK-INST: mrs x3, VTLBIDOS2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2943
+mrs x3, VTLBIDOS3_EL2
+// CHECK-INST: mrs x3, VTLBIDOS3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2963
+mrs x3, TLBIDIDR_EL1
+// CHECK-INST: mrs x3, TLBIDIDR_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xa4,0x38,0xd5]
+// CHECK-UNKNOWN: d538a4c3
+
+msr VTLBID0_EL2, x3
+// CHECK-INST: msr VTLBID0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2803
+msr VTLBID1_EL2, x3
+// CHECK-INST: msr VTLBID1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2823
+msr VTLBID2_EL2, x3
+// CHECK-INST: msr VTLBID2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2843
+msr VTLBID3_EL2, x3
+// CHECK-INST: msr VTLBID3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2863
+msr VTLBIDOS0_EL2, x3
+// CHECK-INST: msr VTLBIDOS0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2903
+msr VTLBIDOS1_EL2, x3
+// CHECK-INST: msr VTLBIDOS1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2923
+msr VTLBIDOS2_EL2, x3
+// CHECK-INST: msr VTLBIDOS2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2943
+msr VTLBIDOS3_EL2, x3
+// CHECK-INST: msr VTLBIDOS3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2963
+
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
new file mode 100644
index 0000000..54fdc23
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mpamv2 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+mlbi alle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vmalle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vpide1
+// CHECK-ERROR: error: specified mlbi op requires a register
+
+mlbi vpmge1
+// CHECK-ERROR: error: specified mlbi op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
new file mode 100644
index 0000000..b8b21e96
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
@@ -0,0 +1,126 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN: | llvm-objdump -d --mattr=+mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN: | llvm-objdump -d --mattr=-mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+mpamv2 -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+msr MPAMCTL_EL1, x0
+// CHECK-INST: msr MPAMCTL_EL1, x0
+// CHECK-ENCODING: [0x40,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN: d518a540
+
+msr MPAMCTL_EL12, x0
+// CHECK-INST: msr MPAMCTL_EL12, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1d,0xd5]
+// CHECK-UNKNOWN: d51da540
+
+msr MPAMCTL_EL2, x0
+// CHECK-INST: msr MPAMCTL_EL2, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca540
+
+msr MPAMCTL_EL3, x0
+// CHECK-INST: msr MPAMCTL_EL3, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea540
+
+msr MPAMVIDCR_EL2, x0
+// CHECK-INST: msr MPAMVIDCR_EL2, x0
+// CHECK-ENCODING: [0x00,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca700
+
+msr MPAMVIDSR_EL2, x0
+// CHECK-INST: msr MPAMVIDSR_EL2, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca720
+
+msr MPAMVIDSR_EL3, x0
+// CHECK-INST: msr MPAMVIDSR_EL3, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea720
+
+
+mrs x0, MPAMCTL_EL1
+// CHECK-INST: mrs x0, MPAMCTL_EL1
+// CHECK-ENCODING: [0x40,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN: d538a540
+
+mrs x0, MPAMCTL_EL12
+// CHECK-INST: mrs x0, MPAMCTL_EL12
+// CHECK-ENCODING: [0x40,0xa5,0x3d,0xd5]
+// CHECK-UNKNOWN: d53da540
+
+mrs x0, MPAMCTL_EL2
+// CHECK-INST: mrs x0, MPAMCTL_EL2
+// CHECK-ENCODING: [0x40,0xa5,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca540
+
+mrs x0, MPAMCTL_EL3
+// CHECK-INST: mrs x0, MPAMCTL_EL3
+// CHECK-ENCODING: [0x40,0xa5,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea540
+
+mrs x0, MPAMVIDCR_EL2
+// CHECK-INST: mrs x0, MPAMVIDCR_EL2
+// CHECK-ENCODING: [0x00,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca700
+
+mrs x0, MPAMVIDSR_EL2
+// CHECK-INST: mrs x0, MPAMVIDSR_EL2
+// CHECK-ENCODING: [0x20,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca720
+
+mrs x0, MPAMVIDSR_EL3
+// CHECK-INST: mrs x0, MPAMVIDSR_EL3
+// CHECK-ENCODING: [0x20,0xa7,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea720
+
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2_VID Extensions
+//------------------------------------------------------------------------------
+
+mlbi vmalle1
+// CHECK-INST: mlbi vmalle1
+// CHECK-ENCODING: [0xbf,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70bf sys #4, c7, c0, #5
+// CHECK-ERROR: error: MLBI VMALLE1 requires: mpamv2
+
+mlbi vpide1, x0
+// CHECK-INST: mlbi vpide1, x0
+// CHECK-ENCODING: [0xc0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70c0 sys #4, c7, c0, #6, x0
+// CHECK-ERROR: error: MLBI VPIDE1 requires: mpamv2
+
+mlbi vpmge1, x0
+// CHECK-INST: mlbi vpmge1, x0
+// CHECK-ENCODING: [0xe0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70e0 sys #4, c7, c0, #7, x0
+// CHECK-ERROR: error: MLBI VPMGE1 requires: mpamv2
+
+// Check that encodings which do not map to a valid MLBI alias are printed as plain SYS instructions
+// [0x9f,0x70,0x0c,0xd5] -> mlbi alle1
+// [0x9e,0x70,0x0c,0xd5] -> sys #4, c7, c0, #4, x30
+
+mlbi alle1
+// CHECK-INST: mlbi alle1
+// CHECK-ENCODING: [0x9f,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709f sys #4, c7, c0, #4
+// CHECK-ERROR: error: MLBI ALLE1 requires: mpamv2
+
+sys #4, c7, c0, #4, x30
+// CHECK-INST: sys #4, c7, c0, #4, x30
+// CHECK-ENCODING: [0x9e,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709e sys #4, c7, c0, #4, x30
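
The CHECK-UNKNOWN lines above print encodings that do not match a known alias in the generic "sys #op1, Cn, Cm, #op2, Xt" form. As a rough sketch (not part of the patch; assumes the standard AArch64 SYS field layout), the fields unpack from the raw word like this:

    def decode_sys(word):
        # AArch64 SYS instruction field layout: op1 in bits [18:16],
        # CRn in [15:12], CRm in [11:8], op2 in [7:5], Rt in [4:0].
        return {
            "op1": (word >> 16) & 0x7,
            "CRn": (word >> 12) & 0xF,
            "CRm": (word >> 8) & 0xF,
            "op2": (word >> 5) & 0x7,
            "Rt":  word & 0x1F,
        }

    print(decode_sys(0xD50C709E))
    # {'op1': 4, 'CRn': 7, 'CRm': 0, 'op2': 4, 'Rt': 30} -> sys #4, c7, c0, #4, x30

Rt = 31 (xzr) selects the register-free alias form, which is why d50c709f above round-trips to "mlbi alle1" while d50c709e falls back to plain SYS.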
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
new file mode 100644
index 0000000..dc2a290
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
@@ -0,0 +1,16 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mtetc -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-REQUIRES-MTETC
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC ZGBVA requires: mtetc
+
+dc gbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc.s b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
new file mode 100644
index 0000000..087b23b
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN: | llvm-objdump -d --mattr=+mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN: | llvm-objdump -d --mattr=-mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+mtetc -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva, x0
+// CHECK-INST: dc zgbva, x0
+// CHECK-ENCODING: [0xa0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74a0 sys #3, c7, c4, #5, x0
+// CHECK-ERROR: error: DC ZGBVA requires: mtetc
+
+dc gbva, x0
+// CHECK-INST: dc gbva, x0
+// CHECK-ENCODING: [0xe0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74e0 sys #3, c7, c4, #7, x0
+// CHECK-ERROR: error: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
new file mode 100644
index 0000000..2440fd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
@@ -0,0 +1,64 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+rme < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+tlbid,+rme < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-NO-REGISTER
+
+// Test without +tlbid: no optional register operand is allowed
+
+tlbi vmalle1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle2is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle3is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1os, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle2, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle3, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paallos, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paall, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid.s b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
new file mode 100644
index 0000000..1362bd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
@@ -0,0 +1,84 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | llvm-objdump -d --mattr=+tlbid,+tlb-rmi,+tlbiw --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | llvm-objdump -d --mattr=-tlbid --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+tlbid,+tlb-rmi,+tlbiw -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A TLBI Domains (FEAT_TLBID)
+
+tlbi vmalle1is
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, xzr
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x31
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x5
+// CHECK-INST: tlbi vmalle1is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088305 sys #0, c8, c3, #0, x5
+
+tlbi vmalle1os, x5
+// CHECK-INST: tlbi vmalle1os, x5
+// CHECK-ENCODING: encoding: [0x05,0x81,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088105 sys #0, c8, c1, #0, x5
+
+tlbi alle1is, x5
+// CHECK-INST: tlbi alle1is, x5
+// CHECK-ENCODING: encoding: [0x85,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8385 sys #4, c8, c3, #4, x5
+
+tlbi alle2is, x5
+// CHECK-INST: tlbi alle2is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8305 sys #4, c8, c3, #0, x5
+
+tlbi alle3is, x5
+// CHECK-INST: tlbi alle3is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0e,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50e8305 sys #6, c8, c3, #0, x5
+
+tlbi vmalls12e1is, x1
+// CHECK-INST: tlbi vmalls12e1is, x1
+// CHECK-ENCODING: encoding: [0xc1,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c83c1 sys #4, c8, c3, #6, x1
+
+tlbi vmalls12e1os, x5
+// CHECK-INST: tlbi vmalls12e1os, x5
+// CHECK-ENCODING: encoding: [0xc5,0x81,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c81c5 sys #4, c8, c1, #6, x5
+
+tlbi vmallws2e1is, x1
+// CHECK-INST: tlbi vmallws2e1is, x1
+// CHECK-ENCODING: encoding: [0x41,0x82,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8241 sys #4, c8, c2, #2, x1
+
+tlbi vmallws2e1os, x1
+// CHECK-INST: tlbi vmallws2e1os, x1
+// CHECK-ENCODING: encoding: [0x41,0x85,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8541 sys #4, c8, c5, #2, x1
diff --git a/llvm/test/MC/AArch64/neon-fdot-diagnostics.s b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
new file mode 100644
index 0000000..4f5f557
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
@@ -0,0 +1,59 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16f32dot 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand
+
+fdot v0.2s, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// fdot indexed
+
+fdot v0.2s, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fdot.s b/llvm/test/MC/AArch64/neon-fdot.s
new file mode 100644
index 0000000..c8a8e2f
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot.s
@@ -0,0 +1,147 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN: | llvm-objdump -d --mattr=+f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN: | llvm-objdump -d --mattr=-f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16f32dot -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fdot v0.2s, v0.4h, v0.4h
+// CHECK-INST: fdot v0.2s, v0.4h, v0.4h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e80fc00 <unknown>
+
+fdot v10.2s, v10.4h, v10.4h
+// CHECK-INST: fdot v10.2s, v10.4h, v10.4h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e8afd4a <unknown>
+
+fdot v31.2s, v31.4h, v31.4h
+// CHECK-INST: fdot v31.2s, v31.4h, v31.4h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e9fffff <unknown>
+
+fdot v0.4s, v0.8h, v0.8h
+// CHECK-INST: fdot v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e80fc00 <unknown>
+
+fdot v10.4s, v10.8h, v10.8h
+// CHECK-INST: fdot v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e8afd4a <unknown>
+
+fdot v31.4s, v31.8h, v31.8h
+// CHECK-INST: fdot v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e9fffff <unknown>
+
+// fdot indexed
+
+fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x00,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409000 <unknown>
+
+fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x0a,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40900a <unknown>
+
+fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x15,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409015 <unknown>
+
+fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x1f,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40901f <unknown>
+
+fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x40,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409140 <unknown>
+
+fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x4a,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40914a <unknown>
+
+fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x55,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409155 <unknown>
+
+fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x5f,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40915f <unknown>
+
+fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xa0,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aa0 <unknown>
+
+fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xaa,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aaa <unknown>
+
+fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xb5,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9ab5 <unknown>
+
+fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xbf,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9abf <unknown>
+
+fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xe0,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9be0 <unknown>
+
+fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xea,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bea <unknown>
+
+fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xf5,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bf5 <unknown>
+
+fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xff,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
new file mode 100644
index 0000000..ccc0742
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16f32mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.4b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4h, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4h, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4s, v0.8s, v0.8s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4s, v0.8s, v0.8s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4d, v0.8d, v0.8d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4d, v0.8d, v0.8d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
new file mode 100644
index 0000000..6b3d352
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN: | llvm-objdump -d --mattr=+f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN: | llvm-objdump -d --mattr=-f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16f32mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.4s, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0x40,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e40ec00 <unknown>
+
+fmmla v10.4s, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0x4a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e4aed4a <unknown>
+
+fmmla v21.4s, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.4s, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0x55,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e55eeb5 <unknown>
+
+fmmla v31.4s, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0x5f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e5fefff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
new file mode 100644
index 0000000..7fc5373
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16mm 2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.8b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8b, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8s, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8s, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8d, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8d, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla.s b/llvm/test/MC/AArch64/neon-fmmla.s
new file mode 100644
index 0000000..f35c2fb
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN: | llvm-objdump -d --mattr=-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm < %s \
+// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+f16mm -disassemble -show-encoding \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.8h, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.8h, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0xc0,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ec0ec00 <unknown>
+
+fmmla v10.8h, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.8h, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0xca,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ecaed4a <unknown>
+
+fmmla v21.8h, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.8h, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0xd5,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ed5eeb5 <unknown>
+
+fmmla v31.8h, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.8h, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0xdf,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4edfefff <unknown>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
index 4f7ca47..358fe0b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -45,6 +45,10 @@ s_set_vgpr_msb 255
// GFX1250: [0xff,0x00,0x86,0xbf]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+s_set_vgpr_msb 0xffff
+// GFX1250: [0xff,0xff,0x86,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
s_monitor_sleep 1
// GFX1250: s_monitor_sleep 1 ; encoding: [0x01,0x00,0x84,0xbf]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_err.s b/llvm/test/MC/AMDGPU/gfx1250_err.s
index 9d1131e..676eb48 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_err.s
@@ -1,15 +1,5 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1250-ERR --implicit-check-not=error: -strict-whitespace %s
-s_set_vgpr_msb -1
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb -1
-// GFX1250-ERR: ^
-
-s_set_vgpr_msb 256
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb 256
-// GFX1250-ERR: ^
-
s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
// GFX1250-ERR: s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
index ad22000..16eba25 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
@@ -65,35 +65,39 @@
#CHECK: mrs x0, MPAMVPM7_EL2
#CHECK: mrs x0, MPAMIDR_EL1
-#CHECK-NOV84: msr S3_0_C10_C5_1, x0
-#CHECK-NOV84: msr S3_0_C10_C5_0, x0
-#CHECK-NOV84: msr S3_4_C10_C5_0, x0
-#CHECK-NOV84: msr S3_6_C10_C5_0, x0
-#CHECK-NOV84: msr S3_5_C10_C5_0, x0
-#CHECK-NOV84: msr S3_4_C10_C4_0, x0
-#CHECK-NOV84: msr S3_4_C10_C4_1, x0
-#CHECK-NOV84: msr S3_4_C10_C6_0, x0
-#CHECK-NOV84: msr S3_4_C10_C6_1, x0
-#CHECK-NOV84: msr S3_4_C10_C6_2, x0
-#CHECK-NOV84: msr S3_4_C10_C6_3, x0
-#CHECK-NOV84: msr S3_4_C10_C6_4, x0
-#CHECK-NOV84: msr S3_4_C10_C6_5, x0
-#CHECK-NOV84: msr S3_4_C10_C6_6, x0
-#CHECK-NOV84: msr S3_4_C10_C6_7, x0
-#CHECK-NOV84: mrs x0, S3_0_C10_C5_1
-#CHECK-NOV84: mrs x0, S3_0_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_6_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_5_C10_C5_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C4_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C4_1
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_0
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_1
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_2
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_3
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_4
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_5
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_6
-#CHECK-NOV84: mrs x0, S3_4_C10_C6_7
-#CHECK-NOV84: mrs x0, S3_0_C10_C4_4
+# Available outside MPAM from Armv9.7
+#CHECK-NOV84: msr MPAM0_EL1, x0
+#CHECK-NOV84: msr MPAM1_EL1, x0
+#CHECK-NOV84: msr MPAM2_EL2, x0
+#CHECK-NOV84: msr MPAM3_EL3, x0
+#CHECK-NOV84: msr MPAM1_EL12, x0
+#CHECK-NOV84: msr MPAMHCR_EL2, x0
+#CHECK-NOV84: msr MPAMVPMV_EL2, x0
+#CHECK-NOV84: msr MPAMVPM0_EL2, x0
+#CHECK-NOV84: msr MPAMVPM1_EL2, x0
+#CHECK-NOV84: msr MPAMVPM2_EL2, x0
+#CHECK-NOV84: msr MPAMVPM3_EL2, x0
+#CHECK-NOV84: msr MPAMVPM4_EL2, x0
+#CHECK-NOV84: msr MPAMVPM5_EL2, x0
+#CHECK-NOV84: msr MPAMVPM6_EL2, x0
+#CHECK-NOV84: msr MPAMVPM7_EL2, x0
+
+# Available outside MPAM from Armv9.7
+#CHECK-NOV84: mrs x0, MPAM0_EL1
+#CHECK-NOV84: mrs x0, MPAM1_EL1
+#CHECK-NOV84: mrs x0, MPAM2_EL2
+#CHECK-NOV84: mrs x0, MPAM3_EL3
+#CHECK-NOV84: mrs x0, MPAM1_EL12
+#CHECK-NOV84: mrs x0, MPAMHCR_EL2
+
+#CHECK-NOV84: mrs x0, MPAMVPMV_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM0_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM1_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM2_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM3_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM4_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM5_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM6_EL2
+#CHECK-NOV84: mrs x0, MPAMVPM7_EL2
+#CHECK-NOV84: mrs x0, MPAMIDR_EL1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
index a8627d6..b84324b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -33,6 +33,9 @@
# GFX1250: s_set_vgpr_msb 0xff ; encoding: [0xff,0x00,0x86,0xbf]
0xff,0x00,0x86,0xbf
+# GFX1250: s_set_vgpr_msb 0xffff ; encoding: [0xff,0xff,0x86,0xbf]
+0xff,0xff,0x86,0xbf
+
# GFX1250: s_monitor_sleep 0 ; encoding: [0x00,0x00,0x84,0xbf]
0x00,0x00,0x84,0xbf
diff --git a/llvm/test/MC/Disassembler/Xtensa/debug.txt b/llvm/test/MC/Disassembler/Xtensa/debug.txt
index 1321f09..5438760 100644
--- a/llvm/test/MC/Disassembler/Xtensa/debug.txt
+++ b/llvm/test/MC/Disassembler/Xtensa/debug.txt
@@ -9,7 +9,7 @@
# CHECK-DEBUG: break 1, 1
# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
-[0x2c,0xf1]
+[0x2d,0xf1]
# CHECK-DEBUG: break.n 1
# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
new file mode 100644
index 0000000..cb44a76
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape-diagnostics.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o 2>&1 | FileCheck %s
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s --check-prefix=NOFDES
+
+## Tests that .cfi_escape sequences that are unrepresentable in sframe
+## trigger a warning and do not produce FDEs.
+
+ .align 1024
+cfi_escape_sp:
+ .cfi_startproc
+ .long 0
+## Defining SP in terms of another register is unrepresentable in sframe.
+## DW_CFA_expression,reg 0x7,length 2,DW_OP_breg6,SLEB(-8)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_expression with SP reg 7
+ .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
+ .long 0
+.cfi_endproc
+
+cfi_escape_args_sp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is not OK if the CFA is based on SP
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size with non frame-pointer CFA
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_val_offset:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## DW_CFA_val_offset,rbp,ULEB scaled offset(16)
+# CHECK: {{.*}}.s:[[#@LINE+1]]:9: warning: skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with FP reg 6
+ .cfi_escape 0x14,0x6,0x2
+ .long 0
+ .cfi_endproc
+
+# NOFDES: Num FDEs: 0
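
The .cfi_escape operands above are raw DWARF call-frame bytes. A minimal decode sketch (not part of the patch; assumes the standard DWARF constants DW_CFA_expression = 0x10 and DW_OP_breg0 = 0x70, with offsets encoded as SLEB128) for the first sequence:

    def sleb128(data, i):
        # Decode a signed LEB128 value starting at data[i].
        result, shift = 0, 0
        while True:
            byte = data[i]
            i += 1
            result |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                if byte & 0x40:  # sign bit of the final byte: sign-extend
                    result -= 1 << shift
                return result, i

    # .cfi_escape 0x10, 0x7, 0x2, 0x76, 0x78
    esc = [0x10, 0x07, 0x02, 0x76, 0x78]
    assert esc[0] == 0x10            # DW_CFA_expression
    reg, length = esc[1], esc[2]     # ULEB128 register and expression length
    assert esc[3] == 0x70 + 6        # DW_OP_breg6 (rbp on x86-64)
    offset, _ = sleb128(esc, 4)
    print(f"rule for reg {reg}: DW_OP_breg6 {offset:+d}")  # reg 7 (rsp) = rbp - 8

DWARF register 7 on x86-64 is rsp, so this escape redefines SP as an expression over rbp, which sframe cannot represent; that is exactly the case the first warning above rejects.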
diff --git a/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
new file mode 100644
index 0000000..df8e7d2
--- /dev/null
+++ b/llvm/test/MC/ELF/cfi-sframe-cfi-escape.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc --filetype=obj --gsframe -triple x86_64 %s -o %t.o
+# RUN: llvm-readelf --sframe %t.o | FileCheck %s
+
+## Tests that .cfi_escape sequences that are safe to pass through still produce FDEs.
+
+ .align 1024
+cfi_escape_ok:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+ ## Uninteresting register
+## DW_CFA_expression,reg 0xc,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x10,0xc,0x2,0x76,0x78
+## DW_CFA_nop
+ .cfi_escape 0x0
+ .cfi_escape 0x0,0x0,0x0,0x0
+ ## Uninteresting register
+## DW_CFA_val_offset,reg 0xc,ULEB scaled offset
+ .cfi_escape 0x14,0xc,0x4
+ .long 0
+ .cfi_endproc
+
+cfi_escape_gnu_args_fp:
+ .cfi_startproc
+ .long 0
+## DW_CFA_GNU_args_size is OK if arg size is zero
+ .cfi_escape 0x2e, 0x0
+ .long 0
+ .cfi_def_cfa_register 6
+ .long 0
+## DW_CFA_GNU_args_size is OK if cfa is FP
+ .cfi_escape 0x2e, 0x20
+ .cfi_endproc
+
+cfi_escape_long_expr:
+ .cfi_startproc
+ .long 0
+ .cfi_def_cfa_offset 16
+## This is a long but valid DWARF expression without sframe
+## implications. An FDE can still be created.
+## DW_CFA_val_offset,rcx,ULEB scaled offset(16), DW_CFA_expression,r10,length 2,DW_OP_breg6,SLEB(-8)
+ .cfi_escape 0x14,0x2,0x2,0x10,0xa,0x2,0x76,0x78
+ .long 0
+ .cfi_endproc
+
+# CHECK: Num FDEs: 3
diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s
index eb362a7..94a6eb1 100644
--- a/llvm/test/MC/Hexagon/arch-support.s
+++ b/llvm/test/MC/Hexagon/arch-support.s
@@ -10,6 +10,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V73 %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V75 %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V79 %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V81 %s
## Check which arch version llvm-mc sets when the user does not provide one.
# RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-DEFAULT %s
@@ -26,6 +27,7 @@
# RUN: llvm-mc -triple=hexagon -mv73 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv75 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
# RUN: llvm-mc -triple=hexagon -mv79 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
+# RUN: llvm-mc -triple=hexagon -mv81 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s
.text
r1 = r1
@@ -41,6 +43,7 @@ r1 = r1
# CHECK-V73: Flags:{{.*}}0x73
# CHECK-V75: Flags:{{.*}}0x75
# CHECK-V79: Flags:{{.*}}0x79
+# CHECK-V81: Flags:{{.*}}0x81
# CHECK-DEFAULT: Flags:{{.*}}0x68
# CHECK-OBJDUMP: { r1 = r1 }
diff --git a/llvm/test/MC/Hexagon/v81_arch.s b/llvm/test/MC/Hexagon/v81_arch.s
new file mode 100644
index 0000000..0cd5d6b
--- /dev/null
+++ b/llvm/test/MC/Hexagon/v81_arch.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=hexagon -mcpu=hexagonv81 -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -triple=hexagon -mcpu=hexagonv81 -mhvx -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+r1=memw(r0)
+{ r0=r0
+ memw(r0)=r0.new }
+
+# CHECK: { r1 = memw(r0+#0x0) }
+# CHECK: { r0 = r0
+# CHECK: memw(r0+#0x0) = r0.new }
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
index 959f3c5..6662220 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -3491,12 +3491,18 @@
# CHECK-BE: mfamr 2 # encoding: [0x7c,0x5d,0x02,0xa6]
# CHECK-LE: mfamr 2 # encoding: [0xa6,0x02,0x5d,0x7c]
mfamr 2
-# CHECK-BE: mtpid 2 # encoding: [0x7c,0x50,0x0b,0xa6]
-# CHECK-LE: mtpid 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
mtpid 2
-# CHECK-BE: mfpid 2 # encoding: [0x7c,0x50,0x0a,0xa6]
-# CHECK-LE: mfpid 2 # encoding: [0xa6,0x0a,0x50,0x7c]
+# CHECK-BE: mtspr 48, 2 # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2 # encoding: [0xa6,0x0b,0x50,0x7c]
+ mtpidr 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
mfpid 2
+# CHECK-BE: mfspr 2, 48 # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48 # encoding: [0xa6,0x0a,0x50,0x7c]
+ mfpidr 2
# CHECK-BE: mtlr 2 # encoding: [0x7c,0x48,0x03,0xa6]
# CHECK-LE: mtlr 2 # encoding: [0xa6,0x03,0x48,0x7c]
mtlr 2
diff --git a/llvm/test/MC/Xtensa/debug.s b/llvm/test/MC/Xtensa/debug.s
index 36b1f11..4ca6368 100644
--- a/llvm/test/MC/Xtensa/debug.s
+++ b/llvm/test/MC/Xtensa/debug.s
@@ -11,7 +11,7 @@ break 1, 1
# Instruction format RRRN
# CHECK-INST: break.n 1
-# CHECK: encoding: [0x2c,0xf1]
+# CHECK: encoding: [0x2d,0xf1]
break.n 1
# Instruction format RRR
diff --git a/llvm/test/MachineVerifier/test_g_shuffle_vector.mir b/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
index 6aba6731..c4ca2d2 100644
--- a/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
+++ b/llvm/test/MachineVerifier/test_g_shuffle_vector.mir
@@ -50,10 +50,16 @@ body: |
%20:_(<2 x s32>) = G_SHUFFLE_VECTOR %3, %19, shufflemask(1, 0)
; CHECK: Bad machine code: G_SHUFFLE_VECTOR cannot change element type
- %21:_(s16) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0)
+ %21:_(s16) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0, 1)
+
+ ; CHECK: Bad machine code: G_SHUFFLE_VECTOR must have vector src
+ %22:_(<2 x s32>) = G_SHUFFLE_VECTOR %3, %4, shufflemask(0, 0)
+
+ %23:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: Bad machine code: G_SHUFFLE_VECTOR must have vector src
+ %24:_(<2 x p0>) = G_SHUFFLE_VECTOR %23, %23, shufflemask(0, 0)
; CHECK: Bad machine code: Out of bounds shuffle index
- %22:_(s32) = G_IMPLICIT_DEF
- %20:_(<2 x s32>) = G_SHUFFLE_VECTOR %22, %22, shufflemask(0, 2)
+ %26:_(<2 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 7)
...
diff --git a/llvm/test/TableGen/intrinsic-manual-name.td b/llvm/test/TableGen/intrinsic-manual-name.td
new file mode 100644
index 0000000..5751fc2
--- /dev/null
+++ b/llvm/test/TableGen/intrinsic-manual-name.td
@@ -0,0 +1,6 @@
+// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS 2>&1 | FileCheck %s -DFILE=%s
+
+include "llvm/IR/Intrinsics.td"
+
+// CHECK: [[FILE]]:[[@LINE+1]]:5: note: Explicitly specified name matches default name, consider dropping it
+def int_foo0 : Intrinsic<[llvm_anyint_ty], [], [], "llvm.foo0">;
diff --git a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
index 1f0737b..d0b8d14 100644
--- a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
@@ -24,9 +24,9 @@
; RUN: llvm-dis %t5.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR1
; RUN: llvm-dis %t5.2.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR2
-; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1A1nEi)
+; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1B1nEi)
-; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1B1nEi
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@@ -55,7 +55,7 @@ entry:
ret i32 0
}
-; CHECK-IR1: define i32 @test(
+; CHECK-IR1: define noundef i32 @test(
define i32 @test(ptr %obj, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
@@ -65,7 +65,7 @@ entry:
%fptr1 = load ptr, ptr %fptrptr, align 8
; Check that the call was devirtualized.
- ; CHECK-IR1: tail call i32 {{.*}}@_ZN1A1nEi
+ ; CHECK-IR1: tail call i32 {{.*}}@_ZN1B1nEi
%call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
ret i32 %call
@@ -73,7 +73,7 @@ entry:
; CHECK-IR2: define i32 @test2
; Check that the call was devirtualized.
-; CHECK-IR2: tail call i32 @_ZN1A1nEi
+; CHECK-IR2: tail call i32 @_ZN1B1nEi
declare i1 @llvm.type.test(ptr, metadata)
declare void @llvm.assume(i1)
diff --git a/llvm/test/ThinLTO/X86/dtlto/json.ll b/llvm/test/ThinLTO/X86/dtlto/json.ll
index 1a38438..ee1c428 100644
--- a/llvm/test/ThinLTO/X86/dtlto/json.ll
+++ b/llvm/test/ThinLTO/X86/dtlto/json.ll
@@ -7,21 +7,33 @@ RUN: rm -rf %t && split-file %s %t && cd %t
RUN: opt -thinlto-bc t1.ll -o t1.bc
RUN: opt -thinlto-bc t2.ll -o t2.bc
-; Perform DTLTO.
+; Perform DTLTO with clang.
RUN: not llvm-lto2 run t1.bc t2.bc -o my.output \
RUN: -r=t1.bc,t1,px -r=t2.bc,t2,px \
RUN: -dtlto-distributor=%python \
RUN: -dtlto-distributor-arg=%llvm_src_root/utils/dtlto/validate.py,--da1=10,--da2=10 \
RUN: -dtlto-compiler=my_clang.exe \
RUN: -dtlto-compiler-arg=--rota1=10,--rota2=20 \
-RUN: 2>&1 | FileCheck %s
+RUN: 2>&1 | FileCheck --check-prefixes=CHECK,CLANG %s
+
+; Perform DTLTO with LLVM driver.
+RUN: not llvm-lto2 run t1.bc t2.bc -o my.output \
+RUN: -r=t1.bc,t1,px -r=t2.bc,t2,px \
+RUN: -dtlto-distributor=%python \
+RUN: -dtlto-distributor-arg=%llvm_src_root/utils/dtlto/validate.py,--da1=10,--da2=10 \
+RUN: -dtlto-compiler=llvm \
+RUN: -dtlto-compiler-prepend-arg=clang \
+RUN: -dtlto-compiler-arg=--rota1=10,--rota2=20 \
+RUN: 2>&1 | FileCheck --check-prefixes=CHECK,LLVM %s
CHECK: distributor_args=['--da1=10', '--da2=10']
; Check the common object.
CHECK: "linker_output": "my.output"
CHECK: "args":
-CHECK-NEXT: "my_clang.exe"
+CLANG-NEXT: "my_clang.exe"
+LLVM-NEXT: "llvm"
+LLVM-NEXT: "clang"
CHECK-NEXT: "-c"
CHECK-NEXT: "--target=x86_64-unknown-linux-gnu"
CHECK-NEXT: "-O2"
@@ -30,7 +42,7 @@ CHECK-NEXT: "-Wno-unused-command-line-argument"
CHECK-NEXT: "--rota1=10"
CHECK-NEXT: "--rota2=20"
CHECK-NEXT: ]
-CHECK: "inputs": []
+CHECK: "inputs": []
; Check the first job entry.
CHECK: "args":
diff --git a/llvm/test/Transforms/ADCE/2016-09-06.ll b/llvm/test/Transforms/ADCE/2016-09-06.ll
index 850f412..1329ac6 100644
--- a/llvm/test/Transforms/ADCE/2016-09-06.ll
+++ b/llvm/test/Transforms/ADCE/2016-09-06.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define i32 @foo(i32, i32, i32) #0 {
+define i32 @foo(i32, i32, i32) {
%4 = alloca i32, align 4
%5 = alloca i32, align 4
%6 = alloca i32, align 4
@@ -48,8 +48,6 @@ B21:
ret i32 %I22
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 4.0.0"}
diff --git a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
index 9708be9..5e844b4 100644
--- a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
+++ b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.10.0"
; CHECK: uselistorder label %bb16, { 1, 0 }
; Function Attrs: noinline nounwind ssp uwtable
-define void @ham(i1 %arg) local_unnamed_addr #0 {
+define void @ham(i1 %arg) local_unnamed_addr {
bb:
br i1 false, label %bb1, label %bb22
@@ -64,8 +64,6 @@ bb22: ; preds = %bb21, %bb
ret void
}
-attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!0 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/Transforms/AddDiscriminators/basic.ll b/llvm/test/Transforms/AddDiscriminators/basic.ll
index 5186537..fc4c10a 100644
--- a/llvm/test/Transforms/AddDiscriminators/basic.ll
+++ b/llvm/test/Transforms/AddDiscriminators/basic.ll
@@ -11,7 +11,7 @@
; if (i < 10) x = i;
; }
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -35,8 +35,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}
-attributes #0 = { nounwind uwtable noinline optnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call-nested.ll b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
index 99340a5..f1373e4 100644
--- a/llvm/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
@@ -9,7 +9,7 @@
; #6 }
; Function Attrs: uwtable
-define i32 @_Z3bazv() #0 !dbg !4 {
+define i32 @_Z3bazv() !dbg !4 {
%1 = call i32 @_Z3barv(), !dbg !11
; CHECK: %1 = call i32 @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
%2 = call i32 @_Z3barv(), !dbg !12
@@ -19,12 +19,9 @@ define i32 @_Z3bazv() #0 !dbg !4 {
ret i32 %3, !dbg !14
}
-declare i32 @_Z3fooii(i32, i32) #1
+declare i32 @_Z3fooii(i32, i32)
-declare i32 @_Z3barv() #1
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare i32 @_Z3barv()
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call.ll b/llvm/test/Transforms/AddDiscriminators/call.ll
index 93d3aa4..11b21ef 100644
--- a/llvm/test/Transforms/AddDiscriminators/call.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call.ll
@@ -8,7 +8,7 @@
; #5 }
; Function Attrs: uwtable
-define void @_Z3foov() #0 !dbg !4 {
+define void @_Z3foov() !dbg !4 {
call void @_Z3barv(), !dbg !10
; CHECK: call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
%a = alloca [100 x i8], align 16
@@ -21,13 +21,10 @@ define void @_Z3foov() #0 !dbg !4 {
ret void, !dbg !13
}
-declare void @_Z3barv() #1
+declare void @_Z3barv()
declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/diamond.ll b/llvm/test/Transforms/AddDiscriminators/diamond.ll
index c93a57a..9edcf39 100644
--- a/llvm/test/Transforms/AddDiscriminators/diamond.ll
+++ b/llvm/test/Transforms/AddDiscriminators/diamond.ll
@@ -12,7 +12,7 @@
; bar(3): discriminator 2
; Function Attrs: uwtable
-define void @_Z3fooi(i32 %i) #0 !dbg !4 {
+define void @_Z3fooi(i32 %i) !dbg !4 {
%1 = alloca i32, align 4
store i32 %i, ptr %1, align 4
call void @llvm.dbg.declare(metadata ptr %1, metadata !11, metadata !12), !dbg !13
@@ -34,13 +34,9 @@ define void @_Z3fooi(i32 %i) #0 !dbg !4 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @_Z3bari(i32) #2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z3bari(i32)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/first-only.ll b/llvm/test/Transforms/AddDiscriminators/first-only.ll
index 7ae9ed0..415e5f0 100644
--- a/llvm/test/Transforms/AddDiscriminators/first-only.ll
+++ b/llvm/test/Transforms/AddDiscriminators/first-only.ll
@@ -13,7 +13,7 @@
; }
; }
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -44,8 +44,6 @@ if.end: ; preds = %if.then, %entry
; CHECK: ret void, !dbg ![[END:[0-9]+]]
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/invoke.ll b/llvm/test/Transforms/AddDiscriminators/invoke.ll
index d39014d..a3989b6 100644
--- a/llvm/test/Transforms/AddDiscriminators/invoke.ll
+++ b/llvm/test/Transforms/AddDiscriminators/invoke.ll
@@ -5,14 +5,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
; Function Attrs: ssp uwtable
-define void @_Z3foov() #0 personality ptr @__gxx_personality_v0 !dbg !8 {
+define void @_Z3foov() personality ptr @__gxx_personality_v0 !dbg !8 {
entry:
%exn.slot = alloca ptr
%ehselector.slot = alloca i32
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL1:[0-9]+]]
- call void @_Z12bar_noexceptv() #4, !dbg !11
+ call void @_Z12bar_noexceptv(), !dbg !11
; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL2:[0-9]+]]
- call void @_Z12bar_noexceptv() #4, !dbg !13
+ call void @_Z12bar_noexceptv(), !dbg !13
invoke void @_Z3barv()
; CHECK: unwind label {{.*}} !dbg ![[INVOKE:[0-9]+]]
to label %invoke.cont unwind label %lpad, !dbg !14
@@ -31,8 +31,8 @@ lpad: ; preds = %entry
catch: ; preds = %lpad
%exn = load ptr, ptr %exn.slot, align 8, !dbg !15
- %3 = call ptr @__cxa_begin_catch(ptr %exn) #4, !dbg !15
- invoke void @__cxa_rethrow() #5
+ %3 = call ptr @__cxa_begin_catch(ptr %exn), !dbg !15
+ invoke void @__cxa_rethrow()
to label %unreachable unwind label %lpad1, !dbg !17
lpad1: ; preds = %catch
@@ -62,7 +62,7 @@ terminate.lpad: ; preds = %lpad1
%7 = landingpad { ptr, i32 }
catch ptr null, !dbg !20
%8 = extractvalue { ptr, i32 } %7, 0, !dbg !20
- call void @__clang_call_terminate(ptr %8) #6, !dbg !20
+ call void @__clang_call_terminate(ptr %8), !dbg !20
unreachable, !dbg !20
unreachable: ; preds = %catch
@@ -70,9 +70,9 @@ unreachable: ; preds = %catch
}
; Function Attrs: nounwind
-declare void @_Z12bar_noexceptv() #1
+declare void @_Z12bar_noexceptv()
-declare void @_Z3barv() #2
+declare void @_Z3barv()
declare i32 @__gxx_personality_v0(...)
@@ -83,22 +83,14 @@ declare void @__cxa_rethrow()
declare void @__cxa_end_catch()
; Function Attrs: noinline noreturn nounwind
-define linkonce_odr hidden void @__clang_call_terminate(ptr) #3 {
- %2 = call ptr @__cxa_begin_catch(ptr %0) #4
- call void @_ZSt9terminatev() #6
+define linkonce_odr hidden void @__clang_call_terminate(ptr) {
+ %2 = call ptr @__cxa_begin_catch(ptr %0)
+ call void @_ZSt9terminatev()
unreachable
}
declare void @_ZSt9terminatev()
-attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-attributes #6 = { noreturn nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
!llvm.ident = !{!7}
diff --git a/llvm/test/Transforms/AddDiscriminators/multiple.ll b/llvm/test/Transforms/AddDiscriminators/multiple.ll
index 54c1a5d..8e8ca6a 100644
--- a/llvm/test/Transforms/AddDiscriminators/multiple.ll
+++ b/llvm/test/Transforms/AddDiscriminators/multiple.ll
@@ -10,7 +10,7 @@
; The two stores inside the if-then-else line must have different discriminator
; values.
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
entry:
%i.addr = alloca i32, align 4
%x = alloca i32, align 4
@@ -45,8 +45,6 @@ if.end: ; preds = %if.else, %if.then
ret void, !dbg !12
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
index c23edd6..f84579b 100644
--- a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
@@ -12,7 +12,7 @@
; altered. If they are, it means that the discriminators pass added a
; new lexical scope.
-define i32 @foo(i64 %i) #0 !dbg !4 {
+define i32 @foo(i64 %i) !dbg !4 {
entry:
%retval = alloca i32, align 4
%i.addr = alloca i64, align 8
@@ -39,10 +39,7 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; We should be able to add discriminators even in the absence of llvm.dbg.cu.
; When using sample profiles, the front end will generate line tables but it
diff --git a/llvm/test/Transforms/AddDiscriminators/oneline.ll b/llvm/test/Transforms/AddDiscriminators/oneline.ll
index 533d547..fc1675b 100644
--- a/llvm/test/Transforms/AddDiscriminators/oneline.ll
+++ b/llvm/test/Transforms/AddDiscriminators/oneline.ll
@@ -10,7 +10,7 @@
; return 100: discriminator 4
; return 99: discriminator 6
-define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
+define i32 @_Z3fooi(i32 %i) !dbg !4 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %i, ptr %2, align 4, !tbaa !13
@@ -49,10 +49,7 @@ define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
index eb7d78f..4704238 100644
--- a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
+++ b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
@@ -1557,24 +1557,24 @@ declare dso_local void @_GLOBAL__sub_I_register_benchmark_test.cc() #0 section "
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #20
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #3 = { nounwind }
-attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #5 = { argmemonly nounwind willreturn }
-attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #14 = { nounwind readnone willreturn }
-attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #16 = { noinline noreturn nounwind }
attributes #17 = { argmemonly nounwind willreturn writeonly }
-attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
attributes #20 = { cold noreturn nounwind }
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
index d272fef..189186b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
; CHECK-LABEL: @f
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
; CHECK: call i32 @llvm.bitreverse.i32
entry:
br label %for.body
@@ -25,8 +25,6 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
index eec0967..35115cf 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
@@ -21,7 +21,7 @@
@b = common global i32 0, align 4
; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
entry:
%b.promoted = load i32, ptr @b, align 4, !tbaa !2
br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
ret i32 undef
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
index 1c990ff..14360fe 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
@@ -10,7 +10,7 @@
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
entry:
br label %for.body
@@ -30,8 +30,6 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
}
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 46fa066..78760db 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-pc-windows-msvc"
; BFIHOIST: br label %endif
; Function Attrs: norecurse
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr #0 personality ptr @__CxxFrameHandler3 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr personality ptr @__CxxFrameHandler3 {
%call = tail call i64 @fn(i64 0)
%call1 = tail call i64 @fn(i64 1)
%tobool = icmp eq i32 %argc, 0
@@ -62,9 +62,6 @@ endif:
ret i32 0
}
-declare i64 @fn(i64) local_unnamed_addr #1
+declare i64 @fn(i64) local_unnamed_addr
declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
index 5127e92..4b8ac09 100644
--- a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
@@ -757,8 +757,7 @@ define i1 @add_neg_1_known_sge_ult_1(i32 %a) {
; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 1
; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -1
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a.sge = icmp sge i32 %a, 1
@@ -823,8 +822,7 @@ define i1 @add_neg_3_known_sge_ult_1(i32 %a) {
; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 3
; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -3
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a.sge = icmp sge i32 %a, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
index 52adc78..8dcac78 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
@@ -389,8 +389,7 @@ define i1 @gep_count_add_1_sge_known_ult_1(i32 %count, ptr %p) {
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp ult ptr [[GEP_SUB]], [[GEP_COUNT]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%sge = icmp sge i32 %count, 1
@@ -415,8 +414,7 @@ define i1 @gep_count_add_1_sge_known_uge_1(i32 %count, ptr %p) {
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp uge ptr [[GEP_SUB]], [[P]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
;
entry:
%sge = icmp sge i32 %count, 1
diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll
index d1f1922..109be51f 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug.ll
@@ -14,7 +14,7 @@ entry:
%0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16
%1 = call i64 @llvm.coro.size.i64(), !dbg !16
%call = call ptr @malloc(i64 %1), !dbg !16
- %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16
+ %2 = call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !16
store ptr %2, ptr %coro_hdl, align 8, !dbg !16
%3 = call i8 @llvm.coro.suspend(token none, i1 false), !dbg !17
%conv = sext i8 %3 to i32, !dbg !17
@@ -69,7 +69,7 @@ coro_Cleanup: ; preds = %sw.epilog, %sw.bb1
br label %coro_Suspend, !dbg !24
coro_Suspend: ; preds = %coro_Cleanup, %sw.default
- call void @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24
+ call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !24
%7 = load ptr, ptr %coro_hdl, align 8, !dbg !24
store i32 0, ptr %late_local, !dbg !24
ret ptr %7, !dbg !24
@@ -82,47 +82,40 @@ ehcleanup:
}
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
-declare ptr @malloc(i64) #3
+declare ptr @malloc(i64)
declare ptr @allocate()
declare void @print({ ptr, i32 })
declare void @log()
; Function Attrs: nounwind readnone
-declare i64 @llvm.coro.size.i64() #4
+declare i64 @llvm.coro.size.i64()
; Function Attrs: nounwind
-declare ptr @llvm.coro.begin(token, ptr writeonly) #5
+declare ptr @llvm.coro.begin(token, ptr writeonly)
; Function Attrs: nounwind
-declare i8 @llvm.coro.suspend(token, i1) #5
+declare i8 @llvm.coro.suspend(token, i1)
-declare void @free(ptr) #3
+declare void @free(ptr)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
; Function Attrs: nounwind
-declare void @llvm.coro.end(ptr, i1, token) #5
+declare void @llvm.coro.end(ptr, i1, token)
; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #2
-
-attributes #0 = { noinline nounwind presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind readonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone }
-attributes #5 = { nounwind }
-attributes #6 = { alwaysinline }
-attributes #7 = { noduplicate }
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
+
+attributes #0 = { noinline nounwind presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
index c53bea8..577ca9a 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
@@ -6,9 +6,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @bar(...) local_unnamed_addr #2
+declare void @bar(...) local_unnamed_addr
; Function Attrs: nounwind uwtable
define ptr @f() #3 !dbg !16 {
@@ -16,14 +16,14 @@ entry:
%0 = tail call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !26
%1 = tail call i64 @llvm.coro.size.i64(), !dbg !26
%call = tail call ptr @malloc(i64 %1), !dbg !26
- %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call) #9, !dbg !26
+ %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !26
tail call void @llvm.dbg.value(metadata ptr %2, metadata !21, metadata !12), !dbg !26
br label %for.cond, !dbg !27
for.cond: ; preds = %for.cond, %entry
tail call void @llvm.dbg.value(metadata i32 undef, metadata !22, metadata !12), !dbg !28
- tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12) #7, !dbg !29
- tail call void (...) @bar() #7, !dbg !33
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12), !dbg !29
+ tail call void (...) @bar(), !dbg !33
%3 = tail call token @llvm.coro.save(ptr null), !dbg !34
%4 = tail call i8 @llvm.coro.suspend(token %3, i1 false), !dbg !34
%conv = sext i8 %4 to i32, !dbg !34
@@ -38,40 +38,31 @@ coro_Cleanup: ; preds = %for.cond
br label %coro_Suspend, !dbg !36
coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup
- tail call void @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38
+ tail call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !38
ret ptr %2, !dbg !39
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #4
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #5
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
; Function Attrs: nounwind
-declare noalias ptr @malloc(i64) local_unnamed_addr #6
-declare i64 @llvm.coro.size.i64() #1
-declare ptr @llvm.coro.begin(token, ptr writeonly) #7
-declare token @llvm.coro.save(ptr) #7
-declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end.p0(ptr nocapture) #4
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #5
-declare void @free(ptr nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(ptr, i1, token) #7
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #5
+declare noalias ptr @malloc(i64) local_unnamed_addr
+declare i64 @llvm.coro.size.i64()
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
+declare void @free(ptr nocapture) local_unnamed_addr
+declare void @llvm.coro.end(ptr, i1, token)
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind }
-attributes #5 = { argmemonly nounwind readonly }
-attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { nounwind }
-attributes #8 = { alwaysinline nounwind }
-attributes #9 = { noduplicate }
+attributes #3 = { nounwind uwtable presplitcoroutine }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
index a27ca9d..f6313c7 100644
--- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
@@ -21,14 +21,14 @@
; updated LLVM functions.
; Function Attrs: uwtable
-define void @_Z2f2v() #0 !dbg !4 {
+define void @_Z2f2v() !dbg !4 {
entry:
call void (i32, ...) @_ZL2f1iz(i32 1), !dbg !15
ret void, !dbg !16
}
; Function Attrs: nounwind uwtable
-define internal void @_ZL2f1iz(i32, ...) #1 !dbg !8 {
+define internal void @_ZL2f1iz(i32, ...) !dbg !8 {
entry:
call void @llvm.dbg.value(metadata i32 %0, metadata !17, metadata !18), !dbg !19
ret void, !dbg !20
@@ -40,8 +40,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
index ae3c746..2eaa275 100644
--- a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -5,8 +5,8 @@ define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias
; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: ret void
;
entry:
@@ -173,7 +173,6 @@ define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
; CHECK-LABEL: define void @dead_unstrided_store(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: ret void
@@ -241,7 +240,6 @@ define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src,
; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
; CHECK-NEXT: ret void
@@ -257,7 +255,6 @@ define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, pt
; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128
; CHECK-NEXT: ret void
@@ -289,7 +286,6 @@ define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, pt
; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
; CHECK-NEXT: ret void
@@ -305,8 +301,6 @@ define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %d
; CHECK-LABEL: define void @dead_matrix_store_dimension_change(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
-; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
index f66aa0f..2cec6b5 100644
--- a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
@g = common global [1 x i8] zeroinitializer, align 1, !dbg !0
; Function Attrs: noinline nounwind uwtable
-define void @foo() #0 !dbg !14 {
+define void @foo() !dbg !14 {
entry:
%i = alloca i8, align 1
store i8 1, ptr %i, align 1, !dbg !19
@@ -37,7 +37,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #2
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
index 4d3a241..628669d 100644
--- a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
@@ -3,13 +3,11 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define void @func() #0 !dbg !4 {
+define void @func() !dbg !4 {
entry:
ret void, !dbg !10
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
index f449047..713dc4e 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
@@ -24,16 +24,13 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
-define i32 @main() #0 !dbg !4 {
+define i32 @main() !dbg !4 {
entry:
call void (...) @func(), !dbg !11
ret i32 0, !dbg !12
}
-declare void @func(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @func(...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 1840f04..543edac 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
@A = common global i32 0, align 4, !dbg !9
; Function Attrs: nounwind uwtable
-define void @test() #0 !dbg !4 {
+define void @test() !dbg !4 {
entry:
tail call void (...) @f() #2, !dbg !14
%0 = load i32, ptr @A, align 4, !dbg !15
@@ -28,12 +28,10 @@ if.end: ; preds = %entry, %if.then
ret void, !dbg !18
}
-declare void @f(...) #1
+declare void @f(...)
-declare void @g(...) #1
+declare void @g(...)
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.gcov = !{!19}
diff --git a/llvm/test/Transforms/GCOVProfiling/linezero.ll b/llvm/test/Transforms/GCOVProfiling/linezero.ll
index 8bfeabd..1eae413 100644
--- a/llvm/test/Transforms/GCOVProfiling/linezero.ll
+++ b/llvm/test/Transforms/GCOVProfiling/linezero.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
%struct.vector = type { i8 }
; Function Attrs: nounwind
-define i32 @_Z4testv() #0 !dbg !15 {
+define i32 @_Z4testv() !dbg !15 {
entry:
%retval = alloca i32, align 4
%__range = alloca ptr, align 8
@@ -63,19 +63,19 @@ return: ; No predecessors!
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @_Z13TagFieldSpecsv() #2
+declare void @_Z13TagFieldSpecsv()
-declare ptr @_ZN6vector5beginEv(ptr) #2
+declare ptr @_ZN6vector5beginEv(ptr)
-declare ptr @_ZN6vector3endEv(ptr) #2
+declare ptr @_ZN6vector3endEv(ptr)
; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()
; Function Attrs: nounwind
-define void @_Z2f1v() #0 !dbg !20 {
+define void @_Z2f1v() !dbg !20 {
entry:
br label %0
@@ -83,11 +83,6 @@ entry:
ret void, !dbg !45
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noreturn nounwind }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!23, !24}
!llvm.gcov = !{!25}
diff --git a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
index 7f169e2..98bdd74 100644
--- a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
+++ b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
@@ -10,7 +10,7 @@
; CHECK-NEXT: load {{.*}} @__llvm_gcov_ctr
; CHECK-NOT: load {{.*}} @__llvm_gcov_ctr
-define dso_local i32 @cannot_split(ptr nocapture readonly %p) #0 !dbg !7 {
+define dso_local i32 @cannot_split(ptr nocapture readonly %p) !dbg !7 {
entry:
%targets = alloca <2 x ptr>, align 16
store <2 x ptr> <ptr blockaddress(@cannot_split, %indirect), ptr blockaddress(@cannot_split, %end)>, ptr %targets, align 16, !dbg !9
@@ -42,8 +42,6 @@ end: ; preds = %indirect
ret i32 0, !dbg !22
}
-attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/GVN/cond_br2.ll b/llvm/test/Transforms/GVN/cond_br2.ll
index 6ceec95..4811ad0 100644
--- a/llvm/test/Transforms/GVN/cond_br2.ll
+++ b/llvm/test/Transforms/GVN/cond_br2.ll
@@ -11,12 +11,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
; CHECK-LABEL: define void @_Z4testv(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-SAME: ) personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SV:%.*]] = alloca %"class.llvm::SmallVector", align 16
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]])
; CHECK-NEXT: [[FIRSTEL_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
; CHECK-NEXT: store ptr [[FIRSTEL_I_I_I_I_I_I]], ptr [[SV]], align 16, !tbaa [[ANYPTR_TBAA0:![0-9]+]]
; CHECK-NEXT: [[ENDX_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -66,10 +66,10 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[CMP_I_I_I_I19:%.*]] = icmp eq ptr [[TMP0]], [[FIRSTEL_I_I_I_I_I_I]]
; CHECK-NEXT: br i1 [[CMP_I_I_I_I19]], label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21:.*]], label %[[IF_THEN_I_I_I20:.*]]
; CHECK: [[IF_THEN_I_I_I20]]:
-; CHECK-NEXT: call void @free(ptr [[TMP0]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP0]])
; CHECK-NEXT: br label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]
; CHECK: [[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]:
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]]) #[[ATTR4]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]])
; CHECK-NEXT: ret void
; CHECK: [[LPAD]]:
; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
@@ -78,7 +78,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp eq ptr [[TMP2]], [[FIRSTEL_I_I_I_I_I_I]]
; CHECK-NEXT: br i1 [[CMP_I_I_I_I]], label %[[EH_RESUME:.*]], label %[[IF_THEN_I_I_I:.*]]
; CHECK: [[IF_THEN_I_I_I]]:
-; CHECK-NEXT: call void @free(ptr [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP2]])
; CHECK-NEXT: br label %[[EH_RESUME]]
; CHECK: [[EH_RESUME]]:
; CHECK-NEXT: resume { ptr, i32 } [[TMP1]]
@@ -86,7 +86,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
entry:
%sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
%FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
%EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -151,11 +151,11 @@ invoke.cont3: ; preds = %invoke.cont2
br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
ret void
lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -166,7 +166,7 @@ lpad: ; preds = %if.end.i14, %if.end
br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
br label %eh.resume
eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -174,24 +174,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}
; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
index 78dbfe1..03bd45b 100644
--- a/llvm/test/Transforms/GVN/matrix-intrinsics.ll
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -8,9 +8,8 @@ define void @redundant_unstrided_load(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -30,9 +29,8 @@ define void @redundant_unstrided_load_non_matrix_store(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -52,9 +50,8 @@ define void @redundant_strided_load(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
@@ -75,9 +72,8 @@ define void @redundant_strided_load_non_matrix_store(ptr %src) {
; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
-; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
; CHECK-NEXT: call void @use(<8 x double> [[L]])
-; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/GVN/pr33549.ll b/llvm/test/Transforms/GVN/pr33549.ll
index a8ce37c..a1e28f7 100644
--- a/llvm/test/Transforms/GVN/pr33549.ll
+++ b/llvm/test/Transforms/GVN/pr33549.ll
@@ -4,9 +4,9 @@
@Data = common local_unnamed_addr global [32 x i32] zeroinitializer, align 4
; Function Attrs: norecurse nounwind
-define void @testshl() local_unnamed_addr #0 {
+define void @testshl() local_unnamed_addr {
; CHECK-LABEL: define void @testshl(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
@@ -78,8 +78,6 @@ for.end10: ; preds = %for.inc8
ret void
}
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+strict-align,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/GVN/pr42605.ll b/llvm/test/Transforms/GVN/pr42605.ll
index 3e6241c..98b447f8 100644
--- a/llvm/test/Transforms/GVN/pr42605.ll
+++ b/llvm/test/Transforms/GVN/pr42605.ll
@@ -114,7 +114,7 @@ if.end: ; preds = %if.then, %entry
ret void
}
-attributes #0 = { noinline norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind readonly uwtable }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
index b9c8537..082d016 100644
--- a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
+++ b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
@@ -14,7 +14,7 @@
@res = common global i32 0, align 4
; Function Attrs:
-define i64 @func() #0 {
+define i64 @func() {
entry:
ret i64 1
}
@@ -27,7 +27,7 @@ entry:
%2 = load volatile i32, ptr @g_x_u, align 4
%3 = load volatile i32, ptr @g_z_u, align 4
%4 = load volatile i32, ptr @g_m, align 4
- %call = call i64 @func() #4
+ %call = call i64 @func()
%conv = sext i32 %1 to i64
%cmp = icmp ne i64 %call, %conv
br i1 %cmp, label %if.end, label %lor.lhs.false
@@ -42,7 +42,7 @@ if.then:
br label %cleanup
if.end:
- %call4 = call i64 @func() #4
+ %call4 = call i64 @func()
%conv5 = zext i32 %3 to i64
%cmp6 = icmp ne i64 %call4, %conv5
br i1 %cmp6, label %if.end14, label %lor.lhs.false8
@@ -57,7 +57,7 @@ if.then13:
br label %cleanup
if.end14:
- %call15 = call i64 @func() #4
+ %call15 = call i64 @func()
%cmp17 = icmp ne i64 %call15, %conv
br i1 %cmp17, label %if.end25, label %lor.lhs.false19
@@ -77,5 +77,3 @@ cleanup:
%retval.0 = phi i32 [ 0, %if.end25 ], [ 1, %if.then24 ], [ 1, %if.then13 ], [ 1, %if.then ]
ret i32 %retval.0
}
-
-attributes #0 = { minsize noinline nounwind optsize uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/GVNHoist/pr30499.ll b/llvm/test/Transforms/GVNHoist/pr30499.ll
index bd6f98c..6df9484b 100644
--- a/llvm/test/Transforms/GVNHoist/pr30499.ll
+++ b/llvm/test/Transforms/GVNHoist/pr30499.ll
@@ -1,6 +1,6 @@
; RUN: opt -S -passes=gvn-hoist < %s
-define void @_Z3fn2v() #0 {
+define void @_Z3fn2v() {
entry:
%a = alloca ptr, align 8
%b = alloca i32, align 4
@@ -11,7 +11,7 @@ entry:
br i1 %tobool, label %if.then, label %if.end
if.then: ; preds = %entry
- %call = call i64 @_Z3fn1v() #2
+ %call = call i64 @_Z3fn1v()
%conv = trunc i64 %call to i32
store i32 %conv, ptr %b, align 4
br label %if.end
@@ -23,8 +23,4 @@ if.end: ; preds = %if.then, %entry
}
; Function Attrs: nounwind readonly
-declare i64 @_Z3fn1v() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readonly }
+declare i64 @_Z3fn1v()
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
index 7cba30d..81a9323 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx"
; CHECK-LABEL: @test1
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-define i32 @test1(ptr %a) #0 {
+define i32 @test1(ptr %a) {
entry:
br label %for.cond
@@ -25,5 +25,3 @@ for.body: ; preds = %for.cond
for.end: ; preds = %for.cond
ret i32 %sum.0
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Inline/always-inline-attr.ll b/llvm/test/Transforms/Inline/always-inline-attr.ll
index 08ea307..4e69822 100644
--- a/llvm/test/Transforms/Inline/always-inline-attr.ll
+++ b/llvm/test/Transforms/Inline/always-inline-attr.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-grtev4-linux-gnu"
; After AlwaysInline the callee's attributes should be merged into the caller's attributes.
-; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0) #0
+; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0)
; CHECK: attributes #0 = { mustprogress uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512"
; Function Attrs: uwtable mustprogress
@@ -36,12 +36,10 @@ entry:
}
; Function Attrs: nounwind readnone
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) #2
-
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+avx512vl,+bmi2,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512f,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "prefer-vector-width"="128" }
+attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "prefer-vector-width"="128" }
!2 = !{!3, !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
diff --git a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
index a4c4134..bf0dfa2 100644
--- a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
+++ b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
@@ -89,11 +89,10 @@ entry:
ret void, !dbg !20
}
-declare void @_Z2f1v() #2
+attributes #0 = { uwtable }
+attributes #1 = { alwaysinline inlinehint uwtable }
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline inlinehint uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z2f1v()
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Inline/inline-vla.ll b/llvm/test/Transforms/Inline/inline-vla.ll
index 8e4bb3d..1e3c235 100644
--- a/llvm/test/Transforms/Inline/inline-vla.ll
+++ b/llvm/test/Transforms/Inline/inline-vla.ll
@@ -9,7 +9,7 @@
@.str1 = private unnamed_addr constant [3 x i8] c"ab\00", align 1
; Function Attrs: nounwind ssp uwtable
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) #0 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) {
entry:
%data = alloca [2 x i8], align 1
call fastcc void @memcpy2(ptr %data, ptr @.str, i64 1)
@@ -18,7 +18,7 @@ entry:
}
; Function Attrs: inlinehint nounwind ssp uwtable
-define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) #1 {
+define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) {
entry:
%vla = alloca i64, i64 %size, align 16
call void @llvm.memcpy.p0.p0.i64(ptr %vla, ptr %src, i64 %size, i1 false)
@@ -27,11 +27,7 @@ entry:
}
; Function Attrs: nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #2
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
index 3021935..2b4aedd 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
@@ -27,20 +27,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 {
+define i32 @bar() !dbg !10 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
index d394713..f547c37 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
@@ -63,20 +63,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define internal i32 @foo() #0 !dbg !7 {
+define internal i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index b0a238f..64b305b 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -72,20 +72,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
entry:
ret i32 1, !dbg !9
}
; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
entry:
%call = call i32 @foo(), !dbg !11
ret i32 %call, !dbg !12
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks.ll b/llvm/test/Transforms/Inline/optimization-remarks.ll
index bc1e690..135d835 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks.ll
@@ -81,9 +81,9 @@ entry:
ret i32 %add
}
-attributes #0 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { alwaysinline nounwind uwtable }
+attributes #1 = { noinline nounwind uwtable }
+attributes #2 = { nounwind uwtable }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
index ecedbdb..abe1ed0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
@@ -124,6 +124,39 @@ define <vscale x 8 x i1> @try_combine_svbool_binop_orr(<vscale x 8 x i1> %a, <vs
ret <vscale x 8 x i1> %t3
}
+; Verify predicate cast does not hinder "isAllActive" knowledge.
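+; With an all-true svbool narrowed via convert.from.svbool, the predicate
+; should still be recognized as all active, so the predicated sve.fadd can
+; fold to a plain fadd (and fmul/fsub likewise in the tests below).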
+define <vscale x 8 x half> @try_combine_svbool_binop_fadd(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fadd(
+; CHECK-NEXT: [[T2:%.*]] = fadd <vscale x 8 x half> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 8 x half> [[T2]]
+;
+ %t1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %t1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 4 x float> @try_combine_svbool_binop_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fmul(
+; CHECK-NEXT: [[T2:%.*]] = fmul <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[T2]]
+;
+ %t1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %t1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 2 x double> @try_combine_svbool_binop_fsub(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fsub(
+; CHECK-NEXT: [[T2:%.*]] = fsub <vscale x 2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 2 x double> [[T2]]
+;
+ %t1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %t1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %t2
+}
+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
index bb01299..3f29509 100644
--- a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
+++ b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -21,7 +21,7 @@
@b = common global i32 0, align 4
; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
entry:
%b.promoted = load i32, ptr @b, align 4, !tbaa !2
br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
ret i32 undef
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/InstCombine/constant-vector-insert.ll b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll
new file mode 100644
index 0000000..2688540
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine %s | FileCheck %s
+; RUN: opt -S -passes=instcombine %s \
+; RUN: -use-constant-int-for-fixed-length-splat \
+; RUN: -use-constant-fp-for-fixed-length-splat \
+; RUN: -use-constant-int-for-scalable-splat \
+; RUN: -use-constant-fp-for-scalable-splat | FileCheck %s
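+; These tests check that a binary op between a splat constant and a constant
+; @llvm.vector.insert folds by applying the op to both the destination-vector
+; and subvector operands of the insert.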
+
+define <vscale x 4 x i32> @insert_div() {
+; CHECK-LABEL: @insert_div(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 3), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 9), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_div_splat_lhs() {
+; CHECK-LABEL: @insert_div_splat_lhs(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 5), <4 x i32> splat (i32 2), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 2), <4 x i32> splat (i32 5), i64 0)
+ %div = udiv <vscale x 4 x i32> splat (i32 10), %0
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_div_mixed_splat() {
+; CHECK-LABEL: @insert_div_mixed_splat(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 6), <4 x i32> splat (i32 3), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 18), <4 x i32> splat (i32 9), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_mul() {
+; CHECK-LABEL: @insert_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 7), i64 4)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 1), i64 4)
+ %mul = mul <vscale x 4 x i32> %0, splat (i32 7)
+ ret <vscale x 4 x i32> %mul
+}
+
+define <vscale x 4 x i32> @insert_add() {
+; CHECK-LABEL: @insert_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 16), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 5), i64 0)
+ %add = add <vscale x 4 x i32> %0, splat (i32 11)
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @insert_add_non_splat_subvector() {
+; CHECK-LABEL: @insert_add_non_splat_subvector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 101, i32 102, i32 103, i32 104>, i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i64 0)
+ %add = add <vscale x 4 x i32> %0, splat (i32 100)
+ ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x float> @insert_add_fp() {
+; CHECK-LABEL: @insert_add_fp(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> splat (float 6.250000e+00), <4 x float> splat (float 5.500000e+00), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x float> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> splat(float 1.25), <4 x float> splat (float 0.5), i64 0)
+ %add = fadd <vscale x 4 x float> %0, splat (float 5.0)
+ ret <vscale x 4 x float> %add
+}
+
+define <vscale x 8 x i32> @insert_add_scalable_subvector() {
+; CHECK-LABEL: @insert_add_scalable_subvector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> splat (i32 20), <vscale x 4 x i32> splat (i32 -4), i64 0)
+; CHECK-NEXT: ret <vscale x 8 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> splat(i32 16), <vscale x 4 x i32> splat (i32 -8), i64 0)
+ %add = add <vscale x 8 x i32> %0, splat (i32 4)
+ ret <vscale x 8 x i32> %add
+}
+
+define <vscale x 4 x i32> @insert_sub() {
+; CHECK-LABEL: @insert_sub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SUB:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> zeroinitializer, i64 8)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 11), i64 8)
+ %sub = add <vscale x 4 x i32> %0, splat (i32 -11)
+ ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @insert_and_partially_undef() {
+; CHECK-LABEL: @insert_and_partially_undef(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> zeroinitializer, <4 x i32> splat (i32 4), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[AND]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 6), i64 0)
+ %and = and <vscale x 4 x i32> %0, splat (i32 4)
+ ret <vscale x 4 x i32> %and
+}
+
+define <vscale x 4 x i32> @insert_fold_chain() {
+; CHECK-LABEL: @insert_fold_chain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 11), <4 x i32> splat (i32 8), i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 21), <4 x i32> splat (i32 12), i64 0)
+ %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+ %add = add <vscale x 4 x i32> %div, splat (i32 4)
+ ret <vscale x 4 x i32> %add
+}
+
+; TODO: This could be folded more.
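+; (e.g. to a single vector.insert of <4 x i32> splat (i32 7) into
+; <vscale x 4 x i32> splat (i32 9)).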
+define <vscale x 4 x i32> @insert_add_both_insert_vector() {
+; CHECK-LABEL: @insert_add_both_insert_vector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 10), <4 x i32> splat (i32 5), i64 0)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat (i32 -1), <4 x i32> splat (i32 2), i64 0)
+; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+;
+entry:
+ %0 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 10), <4 x i32> splat (i32 5), i64 0)
+ %1 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> splat(i32 -1), <4 x i32> splat (i32 2), i64 0)
+ %add = add <vscale x 4 x i32> %0, %1
+ ret <vscale x 4 x i32> %add
+}
diff --git a/llvm/test/Transforms/InstCombine/ctlz-cttz.ll b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
new file mode 100644
index 0000000..871fb34
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ctlz-cttz.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+; ctlz(~i & (i - 1)) -> bitwidth - cttz(i, false)
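+; e.g. for i = 0b00010100: ~i & (i - 1) = 0b00000011, so
+; ctlz = 6 = 8 - cttz(i) = 8 - 2.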
+define i8 @ctlz_to_sub_bw_cttz(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_poison(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_poison(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 true)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_add(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_add(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], 1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, 1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_different_xor(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_different_xor(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, 1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+declare void @use(i8)
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_dec(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[DEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ call void @use(i8 %dec)
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_not(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_not(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: call void @use(i8 [[NOT]])
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ call void @use(i8 %not)
+ %and = and i8 %dec, %not
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_multi_use_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_multi_use_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[DEC:%.*]] = add i8 [[A0]], -1
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A0]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[DEC]], [[NOT]]
+; CHECK-NEXT: call void @use(i8 [[AND]])
+; CHECK-NEXT: [[CLZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[AND]], i1 false)
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %dec, %not
+ call void @use(i8 %and)
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define i8 @ctlz_to_sub_bw_cttz_commute_and(i8 %a0) {
+; CHECK-LABEL: define i8 @ctlz_to_sub_bw_cttz_commute_and(
+; CHECK-SAME: i8 [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw i8 8, [[TMP1]]
+; CHECK-NEXT: ret i8 [[CLZ]]
+;
+ %dec = add i8 %a0, -1
+ %not = xor i8 %a0, -1
+ %and = and i8 %not, %dec
+ %clz = tail call i8 @llvm.ctlz.i8(i8 %and, i1 false)
+ ret i8 %clz
+}
+
+define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(<2 x i8> %a0) {
+; CHECK-LABEL: define <2 x i8> @ctlz_to_sub_bw_cttz_vec_splat(
+; CHECK-SAME: <2 x i8> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[A0]], i1 false)
+; CHECK-NEXT: [[CLZ:%.*]] = sub nuw nsw <2 x i8> splat (i8 8), [[TMP1]]
+; CHECK-NEXT: ret <2 x i8> [[CLZ]]
+;
+ %dec = add <2 x i8> %a0, <i8 -1, i8 -1>
+ %not = xor <2 x i8> %a0, <i8 -1, i8 -1>
+ %and = and <2 x i8> %dec, %not
+ %clz = tail call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %and, i1 false)
+ ret <2 x i8> %clz
+}
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index 2f1f9fc..fc9ab9f 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -222,8 +222,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
define i32 @vec_to_scalar_select_scalar(i1 %b) {
; CHECK-LABEL: @vec_to_scalar_select_scalar(
-; CHECK-NEXT: [[S:%.*]] = select i1 [[B:%.*]], <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
-; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[S]])
+; CHECK-NEXT: [[C:%.*]] = select i1 [[B:%.*]], i32 3, i32 7
; CHECK-NEXT: ret i32 [[C]]
;
%s = select i1 %b, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
@@ -371,3 +370,36 @@ define float @test_fabs_select_multiuse_both_constant(i1 %cond, float %x) {
%fabs = call float @llvm.fabs.f32(float %select)
ret float %fabs
}
+
+; Negative test: Don't replace with a select between the vector mask and zeroinitializer.
+define <16 x i1> @test_select_of_active_lane_mask_bound(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound(
+; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i64 [[N:%.*]], i64 0
+; CHECK-NEXT: [[MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[BASE:%.*]], i64 [[S]])
+; CHECK-NEXT: ret <16 x i1> [[MASK]]
+;
+ %s = select i1 %cond, i64 %n, i64 0
+ %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %s)
+ ret <16 x i1> %mask
+}
+
+define <16 x i1> @test_select_of_active_lane_mask_bound_both_constant(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound_both_constant(
+; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], <16 x i1> splat (i1 true), <16 x i1> zeroinitializer
+; CHECK-NEXT: ret <16 x i1> [[MASK]]
+;
+ %s = select i1 %cond, i64 16, i64 0
+ %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %s)
+ ret <16 x i1> %mask
+}
+
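+; The select on the operand is threaded through the intrinsic:
+; uadd.with.overflow(0, 42) constant-folds to { i64 42, i1 false }, leaving a
+; select between the two aggregate results.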
+define { i64, i1 } @test_select_of_overflow_intrinsic_operand(i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_overflow_intrinsic_operand(
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[N:%.*]], i64 42)
+; CHECK-NEXT: [[ADD_OVERFLOW:%.*]] = select i1 [[COND:%.*]], { i64, i1 } [[TMP1]], { i64, i1 } { i64 42, i1 false }
+; CHECK-NEXT: ret { i64, i1 } [[ADD_OVERFLOW]]
+;
+ %s = select i1 %cond, i64 %n, i64 0
+ %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %s, i64 42)
+ ret { i64, i1 } %add_overflow
+}
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 3454835..1c8b21c 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -3026,6 +3026,56 @@ join:
ret i32 %umax
}
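+; A cross-lane intrinsic applied to a phi can be rewritten as a phi of the
+; per-predecessor results; the zeroinitializer arm constant-folds (to 0 for
+; the add reduction below).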
+define i32 @cross_lane_intrinsic_over_phi(i1 %c, i1 %c2, <4 x i32> %a) {
+; CHECK-LABEL: @cross_lane_intrinsic_over_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A:%.*]])
+; CHECK-NEXT: br label [[JOIN]]
+; CHECK: join:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @may_exit()
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ br i1 %c, label %if, label %join
+
+if:
+ br label %join
+
+join:
+ %phi = phi <4 x i32> [ %a, %if ], [ zeroinitializer, %entry ]
+ call void @may_exit()
+ %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %phi)
+ ret i32 %sum
+}
+
+define { i64, i1 } @overflow_intrinsic_over_phi(i1 %c, i64 %a) {
+; CHECK-LABEL: @overflow_intrinsic_over_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 1)
+; CHECK-NEXT: br label [[JOIN]]
+; CHECK: join:
+; CHECK-NEXT: [[PHI:%.*]] = phi { i64, i1 } [ [[TMP0]], [[IF]] ], [ { i64 1, i1 false }, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @may_exit()
+; CHECK-NEXT: ret { i64, i1 } [[PHI]]
+;
+entry:
+ br i1 %c, label %if, label %join
+
+if:
+ br label %join
+
+join:
+ %phi = phi i64 [ %a, %if ], [ 0, %entry ]
+ call void @may_exit()
+ %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %phi, i64 1)
+ ret { i64, i1 } %add_overflow
+}
+
define i32 @multiple_intrinsics_with_multiple_phi_uses(i1 %c, i32 %arg) {
; CHECK-LABEL: @multiple_intrinsics_with_multiple_phi_uses(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 410c43c..f19cca8 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -40,3 +40,134 @@ define i128 @ptrtoaddr_sext(ptr %p) {
%ext = sext i64 %p.addr to i128
ret i128 %ext
}
+
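+; The address difference between a gep and its base pointer folds directly
+; to the gep offset.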
+define i64 @sub_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i64 @sub_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i64 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.addr
+ ret i64 %sub
+}
+
+define i64 @sub_ptrtoint_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i64 @sub_ptrtoint_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i64 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.int = ptrtoint ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.int
+ ret i64 %sub
+}
+
+define i32 @sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i32 @sub_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i32 [[OFFSET]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %sub = sub i32 %p2.addr, %p.addr
+ ret i32 %sub
+}
+
+define i32 @sub_trunc_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i32 @sub_trunc_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT: ret i32 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %p.addr.trunc = trunc i64 %p.addr to i32
+ %p2.addr.trunc = trunc i64 %p2.addr to i32
+ %sub = sub i32 %p2.addr.trunc, %p.addr.trunc
+ ret i32 %sub
+}
+
+define i16 @sub_trunc_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i16 @sub_trunc_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16
+; CHECK-NEXT: ret i16 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.addr.trunc = trunc i32 %p.addr to i16
+ %p2.addr.trunc = trunc i32 %p2.addr to i16
+ %sub = sub i16 %p2.addr.trunc, %p.addr.trunc
+ ret i16 %sub
+}
+
+define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16
+; CHECK-NEXT: ret i16 [[SUB]]
+;
+ %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset
+ %p.int = ptrtoint ptr addrspace(1) %p to i64
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.int.trunc = trunc i64 %p.int to i16
+ %p2.addr.trunc = trunc i32 %p2.addr to i16
+ %sub = sub i16 %p2.addr.trunc, %p.int.trunc
+ ret i16 %sub
+}
+
+define i128 @sub_zext_ptrtoaddr(ptr %p, i64 %offset) {
+; CHECK-LABEL: define i128 @sub_zext_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = zext i64 [[OFFSET]] to i128
+; CHECK-NEXT: ret i128 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr %p, i64 %offset
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %p.addr.ext = zext i64 %p.addr to i128
+ %p2.addr.ext = zext i64 %p2.addr to i128
+ %sub = sub i128 %p2.addr.ext, %p.addr.ext
+ ret i128 %sub
+}
+
+define i64 @sub_zext_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i64 @sub_zext_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[SUB:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.addr.ext = zext i32 %p.addr to i64
+ %p2.addr.ext = zext i32 %p2.addr to i64
+ %sub = sub i64 %p2.addr.ext, %p.addr.ext
+ ret i64 %sub
+}
+
+define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) {
+; CHECK-LABEL: define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[P2:%.*]] = getelementptr nuw i8, ptr addrspace(1) [[P]], i32 [[OFFSET]]
+; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64
+; CHECK-NEXT: [[P2_ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[P2]] to i32
+; CHECK-NEXT: [[P_INT_EXT:%.*]] = zext i64 [[P_INT]] to i128
+; CHECK-NEXT: [[P2_ADDR_EXT:%.*]] = zext i32 [[P2_ADDR]] to i128
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i128 [[P2_ADDR_EXT]], [[P_INT_EXT]]
+; CHECK-NEXT: ret i128 [[SUB]]
+;
+ %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset
+ %p.int = ptrtoint ptr addrspace(1) %p to i64
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %p.int.ext = zext i64 %p.int to i128
+ %p2.addr.ext = zext i32 %p2.addr to i128
+ %sub = sub i128 %p2.addr.ext, %p.int.ext
+ ret i128 %sub
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index c0be5b9..2ae062cd 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -519,9 +519,7 @@ define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
define i32 @scmp_sgt_slt(i32 %a) {
; CHECK-LABEL: define i32 @scmp_sgt_slt(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i32 [[A]], 31
-; CHECK-NEXT: [[CMP_INV:%.*]] = icmp slt i32 [[A]], 1
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP_INV]], i32 [[A_LOBIT]], i32 1
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
; CHECK-NEXT: ret i32 [[RETVAL_0]]
;
%cmp = icmp sgt i32 %a, 0
@@ -747,3 +745,55 @@ define i8 @scmp_from_select_eq_and_gt_neg3(i32 %x, i32 %y) {
%r = select i1 %eq, i8 0, i8 %sel1
ret i8 %r
}
+
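+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)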
+define i32 @scmp_ashr(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_ashr(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %a.lobit = ashr i32 %a, 31
+ %cmp.inv = icmp slt i32 %a, 1
+ %retval.0 = select i1 %cmp.inv, i32 %a.lobit, i32 1
+ ret i32 %retval.0
+}
+
+; select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
+define i8 @scmp_ashr_sgt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_sgt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp sgt i8 %a, 0
+ %retval = select i1 %cmp, i8 1, i8 %a.lobit
+ ret i8 %retval
+}
+
+; select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+define i8 @scmp_ashr_slt_pattern(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a.lobit = ashr i8 %a, 7
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
+
+define i8 @scmp_ashr_slt_pattern_neg(i8 %a) {
+; CHECK-LABEL: define i8 @scmp_ashr_slt_pattern_neg(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i8 [[A]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], 1
+; CHECK-NEXT: [[RETVAL:%.*]] = select i1 [[CMP]], i8 [[A_LOBIT]], i8 1
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+ %a.lobit = ashr i8 %a, 4
+ %cmp = icmp slt i8 %a, 1
+ %retval = select i1 %cmp, i8 %a.lobit, i8 1
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
index 2348490..2d06f42 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@@ -208,6 +208,3 @@ define <4 x i32> @extract_cond_type_mismatch(<4 x i32> %x, <4 x i32> %y, <5 x i1
%r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
ret <4 x i32> %r
}
-
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement.ll b/llvm/test/Transforms/InstCombine/select-extractelement.ll
index 621d278..6f80b7d 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
-declare void @v4float_user(<4 x float>) #0
+declare void @v4float_user(<4 x float>)
-define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_one_select(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -17,7 +17,7 @@ define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
}
; Multiple extractelements
-define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_two_select(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -34,7 +34,7 @@ define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #
}
; Select has an extra non-extractelement user, don't change it
-define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-LABEL: @extract_one_select_user(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -49,7 +49,7 @@ define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0
ret float %extract
}
-define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_one_vselect_user(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -67,7 +67,7 @@ define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32>
; Do not convert the vector select into a scalar select. That would increase
; the instruction count and potentially obfuscate a vector min/max idiom.
-define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_one_vselect(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -81,7 +81,7 @@ define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
}
; Multiple extractelements from a vector select
-define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @extract_two_vselect(
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -100,7 +100,7 @@ define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32
; The vector selects are not decomposed into scalar selects because that would increase
; the instruction count. Extract+insert is converted to non-lane-crossing shuffles.
; Test multiple extractelements
-define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_vector_select(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 0
@@ -232,5 +232,3 @@ define i32 @inf_loop_partial_undef(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x
%t11 = extractelement <2 x i32> %p, i32 0
ret i32 %t11
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select_frexp.ll b/llvm/test/Transforms/InstCombine/select_frexp.ll
index d025aed..ccfcca1 100644
--- a/llvm/test/Transforms/InstCombine/select_frexp.ll
+++ b/llvm/test/Transforms/InstCombine/select_frexp.ll
@@ -115,10 +115,10 @@ define float @test_select_frexp_no_const(float %x, float %y, i1 %cond) {
define i32 @test_select_frexp_extract_exp(float %x, i1 %cond) {
; CHECK-LABEL: define i32 @test_select_frexp_extract_exp(
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], float 1.000000e+00, float [[X]]
-; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SEL]])
+; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[FREXP_1:%.*]] = extractvalue { float, i32 } [[FREXP]], 1
-; CHECK-NEXT: ret i32 [[FREXP_1]]
+; CHECK-NEXT: [[FREXP_2:%.*]] = select i1 [[COND]], i32 1, i32 [[FREXP_1]]
+; CHECK-NEXT: ret i32 [[FREXP_2]]
;
%sel = select i1 %cond, float 1.000000e+00, float %x
%frexp = call { float, i32 } @llvm.frexp.f32.i32(float %sel)
@@ -132,7 +132,7 @@ define float @test_select_frexp_fast_math_select(float %x, i1 %cond) {
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0
-; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
+; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
; CHECK-NEXT: ret float [[SELECT_FREXP]]
;
%sel = select nnan ninf nsz i1 %cond, float 1.000000e+00, float %x
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 8eeaea1..ee70137 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32-p3:32:32:32:16"
define i64 @test_inbounds(ptr %base, i64 %idx) {
; CHECK-LABEL: @test_inbounds(
@@ -505,6 +505,23 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw(i32 %offset) {
ret i64 %D
}
+define i64 @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating(i32 %offset) {
+; CHECK-LABEL: @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating(
+; CHECK-NEXT: [[A:%.*]] = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 [[OFFSET:%.*]]
+; CHECK-NEXT: [[A_IDX:%.*]] = ptrtoint ptr addrspace(2) [[A]] to i32
+; CHECK-NEXT: [[E:%.*]] = zext i32 [[A_IDX]] to i64
+; CHECK-NEXT: [[D:%.*]] = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64
+; CHECK-NEXT: [[E1:%.*]] = sub nsw i64 [[E]], [[D]]
+; CHECK-NEXT: ret i64 [[E1]]
+;
+ %A = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 %offset
+ %B = ptrtoint ptr addrspace(2) %A to i32
+ %C = zext i32 %B to i64
+ %D = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64
+ %E = sub i64 %C, %D
+ ret i64 %E
+}
+
define i64 @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local(ptr addrspace(2) %p, i32 %offset) {
; CHECK-LABEL: @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local(
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds bfloat, ptr addrspace(2) [[P:%.*]], i32 [[OFFSET:%.*]]
@@ -614,6 +631,20 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw_local(ptr addrspace(2) %
ret i64 %D
}
+define i64 @zext_ptrtoint_sub_ptrtoint_as3_nuw_local(ptr addrspace(3) %p, i16 %offset) {
+; CHECK-LABEL: @zext_ptrtoint_sub_ptrtoint_as3_nuw_local(
+; CHECK-NEXT: [[SUB:%.*]] = zext i16 [[GEP_IDX:%.*]] to i64
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %gep = getelementptr nuw i8, ptr addrspace(3) %p, i16 %offset
+ %gep.int = ptrtoint ptr addrspace(3) %gep to i32
+ %p.int = ptrtoint ptr addrspace(3) %p to i32
+ %gep.int.ext = zext i32 %gep.int to i64
+ %p.int.ext = zext i32 %p.int to i64
+ %sub = sub i64 %gep.int.ext, %p.int.ext
+ ret i64 %sub
+}
+
define i64 @test30(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test30(
; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2
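
The two tests added above hinge on index-width arithmetic: in the new p3 address space the pointer is 32 bits wide but the index width is only 16, and the nuw on the gep promises that base + zext(offset) does not wrap. Under that assumption the difference of the two ptrtoints is exactly the zero-extended offset, which is what @zext_ptrtoint_sub_ptrtoint_as3_nuw_local folds to; the truncating as2 variant keeps the sub because its i16 ptrtoint loses bits. A minimal sketch of the nuw reasoning (assumed function name, datalayout as in the test):

define i32 @gep_sub_is_offset(ptr addrspace(3) %p, i16 %off) {
  ; With nuw, ptrtoint(%gep) == ptrtoint(%p) + zext(%off) with no wrap,
  ; so the subtraction cancels the base and leaves zext i16 %off to i32.
  %gep = getelementptr nuw i8, ptr addrspace(3) %p, i16 %off
  %a = ptrtoint ptr addrspace(3) %gep to i32
  %b = ptrtoint ptr addrspace(3) %p to i32
  %d = sub i32 %a, %b
  ret i32 %d
}
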
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
index edaade32..26ba857 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
@@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
@global.2 = external local_unnamed_addr global i64, align 8
; Function Attrs: norecurse noreturn nounwind uwtable
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
; CHECK-LABEL: define void @hoge(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
@@ -48,8 +48,6 @@ bb27: ; preds = %bb1
br label %bb26
}
-attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 7.0.0 "}
diff --git a/llvm/test/Transforms/LICM/volatile-alias.ll b/llvm/test/Transforms/LICM/volatile-alias.ll
index 410f3be..35e8844 100644
--- a/llvm/test/Transforms/LICM/volatile-alias.ll
+++ b/llvm/test/Transforms/LICM/volatile-alias.ll
@@ -10,7 +10,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
-define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) #0 {
+define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) {
entry:
%p.addr = alloca ptr, align 8
%q.addr = alloca ptr, align 8
@@ -51,5 +51,3 @@ for.end: ; preds = %for.cond
%8 = load i32, ptr %s, align 4
ret i32 %8
}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopRotate/noalias.ll b/llvm/test/Transforms/LoopRotate/noalias.ll
index f709bba..d6b2414 100644
--- a/llvm/test/Transforms/LoopRotate/noalias.ll
+++ b/llvm/test/Transforms/LoopRotate/noalias.ll
@@ -146,11 +146,7 @@ for.end: ; preds = %for.cond
}
; Function Attrs: inaccessiblememonly nounwind
-declare void @llvm.experimental.noalias.scope.decl(metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inaccessiblememonly nounwind }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.experimental.noalias.scope.decl(metadata)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index 5423368..25ded4b 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -157,4 +157,4 @@ bb:
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
index 6c34d1b..a878fc2 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
@@ -62,6 +62,6 @@ for.end: ; preds = %fn3.exit
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
index 914ea7a..9c20685 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
@@ -71,8 +71,8 @@ fn1.exit: ; preds = %lor.end.i
; Function Attrs: nounwind optsize
declare i32 @printf(ptr nocapture readonly, ...) #1
-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind optsize }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
index 3364465..37e2f68 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -46,7 +46,7 @@ for.body3: ; preds = %for.body3, %for.bod
br i1 %exitcond, label %for.cond.loopexit, label %for.body3
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!0 = !{!1, !2, i64 16}
!1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
index ee28aa1..df4dfa6 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
@@ -76,7 +76,7 @@ lee1.exit: ; preds = %lee1.exit.loopexit,
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 27ca414..199203a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -86,7 +86,8 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 41
+; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64))
+; CHECK: Cost for VF 16: 3
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
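
The updated cost lines above reflect that the extends and the multiply feeding a partial reduction are now costed as one bundled EXPRESSION (cost 1) rather than as separate wide-vector instructions, which is why the total for VF 16 drops from 41 to 3; on AArch64 this shape presumably lowers to a single dot-product style instruction. The bundled pattern, as a standalone sketch (assumed names, not part of the patch):

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @dot_step(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  ; zext + zext + mul + partial.reduce.add form the single costed expression.
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %b.wide = zext <16 x i8> %b to <16 x i32>
  %mul = mul nuw nsw <16 x i32> %a.wide, %b.wide
  %red = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
  ret <4 x i32> %red
}
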
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
index 2d15431..8109d068 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) #0 {
-; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
-; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 3dfa6df..287226f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -31,9 +31,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
@@ -52,37 +52,34 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <16 x i32> zeroinitializer, [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -114,10 +111,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -184,9 +181,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -221,9 +218,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -262,10 +259,10 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -331,11 +328,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -352,37 +349,34 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -414,11 +408,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -486,11 +480,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP16]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -508,37 +502,35 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP10]]
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -570,11 +562,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -644,12 +636,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -683,9 +675,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
@@ -726,12 +718,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -802,13 +794,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
-; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP13]]
; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -826,39 +818,37 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP21]] = sub <vscale x 4 x i32> [[TMP19]], [[TMP20]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP11]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP10]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
@@ -890,13 +880,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP19]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP20]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -969,9 +959,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1005,9 +995,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1045,9 +1035,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP15]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1110,8 +1100,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1142,8 +1132,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1178,8 +1168,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP11]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP12]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1240,10 +1230,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -1276,10 +1266,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -1316,10 +1306,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP15]])
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b033f60..b430efc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -467,3 +467,83 @@ loop:
exit:
ret i32 %red.next
}
+
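+; Check that a multiply feeding a partial reduction is handled correctly when
+; the multiply also has a second, otherwise-unused user (%second_use below).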
+define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
+; CHECK-LABEL: define i64 @partial_reduction_mul_two_users(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD_EXT_EXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES2:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV]], [[CONV]]
+; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ADD]] = add i64 [[RES2]], [[MUL_EXT]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[C]]
+; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
+; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
+; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ]
+ %res2 = phi i64 [ 0, %entry ], [ %add, %loop ]
+ %load = load i16, ptr %a, align 2
+ %iv.next = add i64 %iv, 1
+ %conv = sext i16 %b to i32
+ %mul = mul i32 %conv, %conv
+ %mul.ext = zext i32 %mul to i64
+ %add = add i64 %res2, %mul.ext
+ %second_use = or i32 %mul, %c ; otherwise unused; giving %mul a second user is all this test needs
+ %load.ext = sext i16 %load to i32
+ %load.ext.ext = sext i32 %load.ext to i64
+ %exitcond740.not = icmp eq i64 %iv, %n
+ br i1 %exitcond740.not, label %exit, label %loop
+
+exit:
+ ret i64 %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 8ece59a..d8f1a86 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -16,10 +16,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -68,7 +68,6 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -77,11 +76,12 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -112,7 +112,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
@@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0
; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1
; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret void
@@ -495,7 +495,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 09b41fb..26e630f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -26,26 +26,26 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]])
+; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -71,25 +71,25 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
+; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
-; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
+; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
@@ -137,26 +137,26 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
+; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]])
+; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
@@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -182,25 +182,25 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
+; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
-; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
+; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-NOI8MM: scalar.ph:
@@ -242,18 +242,18 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -337,18 +337,18 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 801eb81..b847631 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -18,10 +18,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP7]], [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -80,10 +80,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -720,26 +720,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -788,59 +788,59 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP20]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP21]])
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP30]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP29]])
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP40]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP33]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP37]])
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP50]])
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP41]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP45]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -884,26 +884,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -2025,7 +2025,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
@@ -2062,7 +2062,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -2091,7 +2091,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 1ace7d4..4636c1b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
@@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
@@ -75,26 +75,26 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-MAXBW: scalar.ph:
@@ -134,10 +134,10 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -157,54 +157,78 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i64> [[TMP2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP12]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI2]], <16 x i64> [[TMP14]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD11]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <16 x i64> [[TMP15]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI3]], <16 x i64> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX15:%.*]] = add <2 x i64> [[PARTIAL_REDUCE13]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX16]])
; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: for.exit:
-; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]]
-; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP15]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -245,10 +269,10 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -269,30 +293,50 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 24
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <8 x i16> [[WIDE_LOAD11]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <8 x i16> [[WIDE_LOAD7]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <8 x i64> [[TMP12]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI2]], <8 x i64> [[TMP14]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <8 x i16> [[WIDE_LOAD12]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <8 x i16> [[WIDE_LOAD8]] to <8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <8 x i64> [[TMP15]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI3]], <8 x i64> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX17:%.*]] = add <2 x i64> [[PARTIAL_REDUCE15]], [[BIN_RDX16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX17]])
; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: for.exit:
; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]]
@@ -302,36 +346,28 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP2]], align 2
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP15]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP17]] = add <vscale x 4 x i64> [[TMP16]], [[VEC_PHI]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP17]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -687,7 +723,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]])
; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
@@ -835,7 +871,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
@@ -964,7 +1000,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -1024,26 +1060,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
@@ -1092,58 +1128,58 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP15]])
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]])
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP23]])
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP25]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]])
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]]
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP33]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]]
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP38]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP39]])
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP40]], [[TMP41]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1165,21 +1201,21 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1191,38 +1227,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP16]], align 1
-; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP27]], [[TMP32]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI3]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD5]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 16 x i32> [[TMP18]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI2]], <vscale x 16 x i32> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD8]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD9]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <vscale x 16 x i32> [[TMP21]], [[TMP22]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP23]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD11]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD12]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP25]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]])
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -1390,7 +1426,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]]
@@ -1525,7 +1561,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1565,110 +1601,93 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw nsw <16 x i64> [[TMP3]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP12]], [[TMP8]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -1971,7 +1990,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2104,7 +2123,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2160,10 +2179,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2181,10 +2200,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVED: for.body.preheader:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
@@ -2194,19 +2213,29 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i64> [[TMP13]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i64> [[TMP10]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2219,35 +2248,35 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-MAXBW: for.body.preheader:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP12]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
-; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP14]], [[TMP10]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
@@ -2319,17 +2348,16 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2341,42 +2369,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2429,7 +2458,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2441,42 +2469,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]])
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]])
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
@@ -2529,7 +2558,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1
@@ -2541,42 +2569,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]])
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]])
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]])
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]])
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]])
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]])
-; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]])
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]])
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]])
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
-; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
-; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
-; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
-; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
-; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]])
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]])
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]])
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
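The updated checks above replace plain wide-vector add reductions with @llvm.vector.partial.reduce.add, which folds a wide input into a narrower accumulator on every iteration. As a hedged aside (an illustrative standalone sketch, not taken from this patch; the function name is invented), the dot-product shape these tests expect looks like the following. Note that only the total sum of the result is defined; the mapping of input lanes onto accumulator lanes is unspecified.

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @partial_reduce_sketch(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  ; Widen both i8 operands so the 16 products fit in i32 lanes.
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %b.wide = zext <16 x i8> %b to <16 x i32>
  ; i8*i8 products are at most 255*255, so nuw and nsw both hold at i32.
  %mul = mul nuw nsw <16 x i32> %b.wide, %a.wide
  ; Accumulate all 16 products into the 4-lane running total.
  %red = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
  ret <4 x i32> %red
}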
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
index b308b92..bd9fae6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
@@ -23,12 +23,12 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]]
+; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]]
-; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
-; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]])
+; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; IC2-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
+; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP6]])
; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -80,18 +80,18 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; IC4-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
+; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
; IC4-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; IC4-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; IC4-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]]
; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
-; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
-; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
-; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; IC4-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP10]], [[TMP10]]
; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
-; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; IC4-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; IC4-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP14]], [[TMP14]]
+; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
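With an interleave count above 1, each unrolled copy keeps its own partial accumulator; the middle block then adds the accumulators pairwise before the final horizontal reduction, as the BIN_RDX lines in the interleaved checks earlier in this diff show. A minimal sketch of that tail (illustrative only, not from the patch):

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @combine_interleaved(<4 x i32> %part0, <4 x i32> %part1) {
  ; Pairwise-combine the per-copy accumulators, then reduce to a scalar.
  %bin.rdx = add <4 x i32> %part1, %part0
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  ret i32 %sum
}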
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 7bb4715..6dae09e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -48,19 +48,19 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -78,27 +78,27 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br label [[ENTRY:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP13]]
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP6]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-MAXBW: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 70532ad..46ec858 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -45,8 +45,8 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -138,8 +138,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -227,8 +227,8 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -321,8 +321,8 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -421,12 +421,12 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP7]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -811,8 +811,8 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -1000,11 +1000,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1012,7 +1011,8 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -1031,23 +1031,23 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP5]])
; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -1071,11 +1071,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
-; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A]], i32 0
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1083,6 +1082,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1164,12 +1164,12 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]])
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
index ebf4a4f..ab1486d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -37,7 +37,7 @@ for.end: ; preds = %for.body
ret i32 %conv27
}
-attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 25ee100..70685c1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 6 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 11cc971..fb7890a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1)
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 786a2aa..28d2a27 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1630,4 +1630,4 @@ for.body: ; preds = %for.body, %entry
}
attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
-attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }
+attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index ae8dc2d..005ca8c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -175,18 +175,28 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -227,18 +237,28 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index b23702d..2a19402 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8
+; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8
+; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -435,7 +435,7 @@ exit:
ret void
}
-; We should interleave by 2 after narrowing interleave groups to saturate
+; FIXME: We should interleave by 2 after narrowing interleave groups to saturate
; load/store units.
define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK-LABEL: define void @test_interleave_after_narrowing(
@@ -447,18 +447,12 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
new file mode 100644
index 0000000..46b0ebd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2IC1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF2IC2 %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @load_store_interleave_group_block_invar_cond(ptr noalias %data, ptr noalias %dst.0, ptr noalias %dst.1, i1 %c) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*:]]
+; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC1: [[VECTOR_PH]]:
+; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC1: [[VECTOR_BODY]]:
+; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1: [[PRED_STORE_IF]]:
+; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1: [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC1-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP2]], align 1
+; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC1: [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT: br label %[[EXIT:.*]]
+; VF2IC1: [[EXIT]]:
+; VF2IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2: [[PRED_STORE_IF]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2: [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; VF2IC2: [[PRED_STORE_IF6]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; VF2IC2: [[PRED_STORE_CONTINUE7]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; VF2IC2: [[PRED_STORE_IF8]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; VF2IC2: [[PRED_STORE_CONTINUE9]]:
+; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2: [[PRED_STORE_IF10]]:
+; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2: [[PRED_STORE_CONTINUE11]]:
+; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 2
+; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %mul.2 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+ %l.0 = load i64, ptr %data.0, align 8
+ store i64 %l.0, ptr %data.0, align 8
+ %add.1 = or disjoint i64 %mul.2, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+ %l.1 = load i64, ptr %data.1, align 8
+ store i64 %l.1, ptr %data.1, align 8
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ store i8 1, ptr %dst.0
+ br label %loop.latch
+
+loop.latch:
+ %gep.dst.1 = getelementptr inbounds i8, ptr %dst.1, i64 %iv
+ store i8 0, ptr %gep.dst.1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @load_store_interleave_group_block_var_cond(ptr noalias %data, ptr %masks, ptr noalias %dst) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC1-NEXT: [[ENTRY:.*:]]
+; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC1: [[VECTOR_PH]]:
+; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC1: [[VECTOR_BODY]]:
+; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF2IC1-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; VF2IC1-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1: [[PRED_STORE_IF]]:
+; VF2IC1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP5]]
+; VF2IC1-NEXT: store i8 1, ptr [[TMP6]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1: [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; VF2IC1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP8]]
+; VF2IC1-NEXT: store i8 1, ptr [[TMP9]], align 1
+; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1: [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC1: [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT: br label %[[EXIT:.*]]
+; VF2IC1: [[EXIT]]:
+; VF2IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 2
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
+; VF2IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP8]], align 1
+; VF2IC2-NEXT: [[TMP9:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF2IC2-NEXT: [[TMP10:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], zeroinitializer
+; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2: [[PRED_STORE_IF]]:
+; VF2IC2-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF2IC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP12]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP13]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2: [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; VF2IC2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF2IC2: [[PRED_STORE_IF7]]:
+; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP15]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP16]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; VF2IC2: [[PRED_STORE_CONTINUE8]]:
+; VF2IC2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; VF2IC2-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF2IC2: [[PRED_STORE_IF9]]:
+; VF2IC2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP18]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP19]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; VF2IC2: [[PRED_STORE_CONTINUE10]]:
+; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2: [[PRED_STORE_IF11]]:
+; VF2IC2-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
+; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP21]]
+; VF2IC2-NEXT: store i8 1, ptr [[TMP22]], align 1
+; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2: [[PRED_STORE_CONTINUE12]]:
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %mul.2 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+ %l.0 = load i64, ptr %data.0, align 8
+ store i64 %l.0, ptr %data.0, align 8
+ %add.1 = or disjoint i64 %mul.2, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+ %l.1 = load i64, ptr %data.1, align 8
+ store i64 %l.1, ptr %data.1, align 8
+ %gep.mask = getelementptr inbounds i8, ptr %masks, i64 %iv
+ %l.mask = load i8, ptr %gep.mask
+ %c = icmp eq i8 %l.mask, 0
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ store i8 1, ptr %gep.mask
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 2865495..c261760 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -13,8 +13,12 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8
-; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8
+; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8
+; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index 305a692..b63e03d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -1,81 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
-; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s
-; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
target triple = "aarch64-unknown-linux"
define void @load_store_interleave_group(ptr noalias %data) {
-; IC1-LABEL: define void @load_store_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8
-; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @load_store_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -105,82 +53,27 @@ exit:
}
define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) {
-; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8
-; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]]
-; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]]
-; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]]
; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
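
In both scalable tests above, the narrowed form processes vscale pairs per iteration: the <vscale x 2 x i64> access covers vscale*2 i64 elements, the data offset is 2*index, and the induction advances by plain vscale, which is why the removed unrolled GEPs and loads collapse to a single access. Condensed from the surviving check lines (names shortened):

  %vs   = call i64 @llvm.vscale.i64()
  %off  = shl nsw i64 %index, 1                       ; element offset = 2 * pair index
  %gep  = getelementptr inbounds i64, ptr %data, i64 %off
  %v    = load <vscale x 2 x i64>, ptr %gep, align 8  ; vscale pairs = vscale*2 i64s
  store <vscale x 2 x i64> %v, ptr %gep, align 8
  %index.next = add nuw i64 %index, %vs               ; advance by vscale pairs
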
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index abfb44d..d290f2d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -60,26 +60,32 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]]
-; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
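
With unroll factor 2, the rewritten checks above keep two group accesses per iteration but change two things: the per-part scalar factor load plus broadcast splat becomes a plain <2 x i64> vector load (a distinct factor per pair), and the index now steps by 4 pairs with the second part offset by 2. A side-by-side sketch of the factor access, condensed from the removed and added lines (GEP name assumed):

  ; before: one scalar load per part, splatted across the group
  %f     = load i64, ptr %gep.factor, align 8
  %ins   = insertelement <2 x i64> poison, i64 %f, i64 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> poison, <2 x i32> zeroinitializer
  ; after: one vector load per part, one factor for each of the two pairs
  %factors = load <2 x i64>, ptr %gep.factor, align 8
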
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index f2e689c..75980ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -328,8 +328,10 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
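
The new checks above interleave the broadcast value with itself before storing. Both shuffles are no-ops on a splat, so the wide store still writes the same value to every lane; a short lane-arithmetic sketch (pointer name assumed):

  ; concatenating a splat with itself is still a splat ...
  %cat = shufflevector <2 x i64> %splat, <2 x i64> %splat, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; ... and any permutation of a splat is the splat itself
  %ilv = shufflevector <4 x i64> %cat, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %ilv, ptr %dst, align 8
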
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea..49f663f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -23,18 +23,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4)
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
-; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<[[ACC]]>, ir<%mul>
+; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -42,7 +39,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -89,10 +86,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
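
The VPlan change above folds the two WIDEN-CAST recipes and the WIDEN mul into a single abstract EXPRESSION recipe that defines vp<[[REDUCE]]> directly; only the epilogue plan in the second hunk still prints the recipes separately (with the zexts reordered). With VF 16 and the printed 1/4 scale, the expression would materialize as roughly the following (types assumed from the "VF scaled by 1/4" note; they are not printed in the plan):

  %ext.a = zext <16 x i8> %load.a to <16 x i32>
  %ext.b = zext <16 x i8> %load.b to <16 x i32>
  %mul   = mul <16 x i32> %ext.b, %ext.a
  %add   = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
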
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
index 0f398a6..2579918 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -327,5 +327,5 @@ for.end: ; preds = %for.body, %entry
declare float @fabsf(float)
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 0a9b1e0..61e3a18 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -60,10 +60,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -125,17 +125,17 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -222,10 +222,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -287,17 +287,17 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -384,10 +384,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -449,18 +449,18 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -545,10 +545,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -610,18 +610,18 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
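
All four RISCV hunks above are pure reorderings: each sext/zext moves next to the mul it feeds and each mul next to its partial-reduce call, with fresh TMP numbers but identical operands, so the checked dataflow is unchanged. The vqdotsu case is the one mixed-signedness pattern; condensed from the scalable checks:

  %sa = sext <vscale x 4 x i8> %wide.load1 to <vscale x 4 x i32>  ; b operand: signed
  %zb = zext <vscale x 4 x i8> %wide.load  to <vscale x 4 x i32>  ; a operand: unsigned
  %m  = mul <vscale x 4 x i32> %sa, %zb
  %r  = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %acc, <vscale x 4 x i32> %m)
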
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index e42e2c7..b26e9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -1779,7 +1779,7 @@ for.body: ; preds = %entry, %for.body
; CHECK: LV: Scalar loop costs: 24
; CHECK: LV: Vector loop of width 2 costs: 33
; CHECK: LV: Vector loop of width 4 costs: 30
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Selecting VF: 1
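
The corrected check matches the costs printed just above: with per-width costs of 33 (width 2) and 30 (width 4) against a scalar loop cost of 24, no vector width is profitable, so the vectorizer should report VF 1; the old check contradicted the cost lines it follows. As a worked comparison:

  ; width 2: 33 vs scalar 24  -> not profitable
  ; width 4: 30 vs scalar 24  -> not profitable
  ; no profitable width       -> LV: Selecting VF: 1
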
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
new file mode 100644
index 0000000..2338da5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s -S | FileCheck %s
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s -S | FileCheck %s --check-prefix=CHECK-MAX-BANDWIDTH
+
+target triple = "wasm32"
+
+define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; CHECK-MAX-BANDWIDTH-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-MAX-BANDWIDTH-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAX-BANDWIDTH-NEXT: [[ENTRY:.*]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAX-BANDWIDTH: [[MIDDLE_BLOCK]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK-MAX-BANDWIDTH: [[SCALAR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[FOR_COND_CLEANUP]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-MAX-BANDWIDTH: [[FOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add3
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %red = phi i32 [ %add3, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %iv
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %add = add i32 %red, %conv
+ %add3 = add i32 %add, %conv2
+ %inc = add nuw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
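
The new test above checks that without -vectorizer-maximize-bandwidth the double accumulation stays as two plain <4 x i32> adds at VF 4, while the max-bandwidth run widens to VF 16 and chains two partial reductions into one <4 x i32> accumulator. A semantics sketch of that chain (the intrinsic only fixes the total sum; how input lanes map onto the narrower accumulator is left to the target):

  ; fold 16 zero-extended bytes into the 4-lane accumulator, twice per iteration
  %p1 = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.ext)
  %p2 = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %p1,  <16 x i32> %b.ext)
  ; one final 4-lane reduce in the middle block yields the scalar sum
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %p2)
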
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 9168ebf..08d39ea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -214,7 +214,7 @@ for.end15: ; preds = %for.end.us, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!3 = !{!4, !5}
!4 = !{!4}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index 44e9d3e..b7f8be1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -71,6 +71,6 @@ declare i32 @printf(ptr, ...) #1
; Function Attrs: nounwind
declare i32 @puts(ptr nocapture readonly) #2
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index f066000..52e90e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -362,4 +362,4 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="false" }
+attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 005696a..e405fe7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -335,4 +335,4 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
index b98a2ea..c7550ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -143,7 +143,7 @@ for.inc:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index f4d80af..2a3ce03 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -8,69 +8,15 @@ target triple = "x86_64-unknown-linux"
define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
; CHECK-LABEL: define void @test_4xi64(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
-; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]]
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]]
-; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8
-; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
-; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
-; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
-; CHECK: [[VEC_EPILOG_PH]]:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
-; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
@@ -81,15 +27,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
-; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
@@ -110,7 +56,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -171,7 +117,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -194,7 +140,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -249,7 +195,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -272,7 +218,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -327,7 +273,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -350,7 +296,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -405,7 +351,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -428,7 +374,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -489,7 +435,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -513,7 +459,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -573,7 +519,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -598,7 +544,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -707,7 +653,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -731,7 +677,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -765,20 +711,19 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
index 2c187ca..9471e4a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -72,7 +72,7 @@ for.end: ; preds = %for.body, %entry
ret void, !dbg !27
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
index 66592b0..fdeb497 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
@@ -288,3 +288,72 @@ loop.latch:
exit:
ret void
}
+
+define void @const_fold_binaryintrinsic(ptr %dst, i64 %d) {
+; CHECK-LABEL: define void @const_fold_binaryintrinsic(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: store i64 3, ptr [[DST]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %const.0 = xor i64 %d, %d
+  %max = call i64 @llvm.umax.i64(i64 %const.0, i64 3)
+  store i64 %max, ptr %dst, align 2
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv.next, 100
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @const_fold_widegep(ptr noalias %A, ptr noalias %B, i64 %d) {
+; CHECK-LABEL: define void @const_fold_widegep(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[D:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: store ptr [[A]], ptr [[B]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %const.0 = xor i64 %d, %d
+ %gep.A = getelementptr i64, ptr %A, i64 %const.0
+ %gep.B = getelementptr i64, ptr %B, i64 %const.0
+ store ptr %gep.A, ptr %gep.B
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp ult i64 %iv.next, 100
+ br i1 %exit.cond, label %loop, label %exit
+
+exit:
+ ret void
+}
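
A short note on the two tests added above (reasoning from the IR shown, not part of the checked-in file): both loop bodies collapse to a single loop-invariant store because `xor i64 %d, %d` is 0 for any %d, so the folds behind the CHECK lines are:

  ;   %const.0 = xor i64 %d, %d             ; always 0
  ;   @llvm.umax.i64(i64 0, i64 3)          ; folds to 3  -> "store i64 3, ptr %dst"
  ;   getelementptr i64, ptr %A, i64 0      ; folds to %A -> "store ptr %A, ptr %B"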
diff --git a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
index 8ce2a97..0656de4 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
@@ -50,7 +50,7 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index 65bd36c..5e51624f 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -134,7 +134,7 @@ for.cond.cleanup:
ret void, !dbg !44
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 4b7b714..77ec95a 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -145,7 +145,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 5cf99b8..3487331 100644
--- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -135,7 +135,7 @@ thread-pre-split5: ; preds = %.lr.ph
ret void
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 8efe29a..fc2e233 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -783,7 +783,7 @@ for.body: ; preds = %for.body, %entry
@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4
-define void @int_float_struct(ptr nocapture readonly %A) #0 {
+define void @int_float_struct(ptr nocapture readonly %A) {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -1546,5 +1546,3 @@ loop:
end:
ret void
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
index ddf9029..22243d9 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -89,7 +89,7 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!0, !1, !5}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index a1fc1b8..87b5a0b 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -56,4 +56,4 @@ for.end: ; preds = %for.body
ret i32 0
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 705bbab..64ae730 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -218,4 +218,4 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 1effb10..872a3929 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -129,4 +129,4 @@ for.end: ; preds = %for.body
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 8224d6b..137e098 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -104,8 +104,8 @@ for.end26: ; preds = %for.cond4.for.end26
}
declare i32 @fn2(double) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!"int", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
new file mode 100644
index 0000000..01934b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure an unsafe user-specified interleave count is ignored.
+
+; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-LABEL: @loop_distance_4
+define void @loop_distance_4(ptr %a, ptr %b) {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ]
+ %0 = getelementptr i32, ptr %b, i64 %iv
+ %arrayidx = getelementptr i8, ptr %0, i64 -16
+ %1 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %2 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, ptr %0, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 64
+ br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+!1 = !{!1, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
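
A note on why the hint is unsafe in the new test above (reasoning from the IR, not part of the checked-in file): the store to %0 (b[iv]) and the load from %arrayidx (b[iv - 4]) form a loop-carried dependence with distance 4 iterations, so the effective vector width must satisfy VF * IC <= 4. A minimal sketch:

  ; iteration i writes b[i]; iteration i+4 reads it back
  ;   store i32 %add, ptr %0      ; b[iv]
  ;   load  i32, ptr %arrayidx    ; b[iv - 4]
  ; VF=4, IC=1 -> width 4  (safe; matches the vectorize.width hint)
  ; VF=4, IC=4 -> width 16 (unsafe; the interleave hint is dropped with the remark)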
diff --git a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
index e2f5007..cc187de 100644
--- a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
@@ -35,11 +35,7 @@ if.end5: ; preds = %if.then, %entry
ret i1 %rez.0
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
diff --git a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
index d4fd34a..d8065e8 100644
--- a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
@@ -7,13 +7,13 @@ target triple = "i386-pc-windows-msvc19.11.0"
%class.a = type { i32, i32, i32, i32, i32 }
; Function Attrs: nounwind optsize
-define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr #0 {
+define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr {
; CHECK-LABEL: @pr41917(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label %"land.end+land.rhs3"
; CHECK: land.rhs:
-; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR3]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br label %"land.end+land.rhs3"
; CHECK: "land.end+land.rhs3":
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[G:%.*]], i32 0, i32 1
@@ -25,11 +25,11 @@ define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly alig
; CHECK-NEXT: ret i1 [[TMP2]]
;
entry:
- %call = tail call zeroext i1 @f2() #2
+ %call = tail call zeroext i1 @f2()
br i1 %call, label %land.rhs, label %land.end
land.rhs: ; preds = %entry
- %call1 = tail call zeroext i1 @f2() #2
+ %call1 = tail call zeroext i1 @f2()
br label %land.end
land.end: ; preds = %land.rhs, %entry
@@ -53,11 +53,7 @@ land.end6: ; preds = %land.rhs3, %land.en
ret i1 %4
}
-declare dso_local zeroext i1 @f2() local_unnamed_addr #1
-
-attributes #0 = { nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind optsize }
+declare dso_local zeroext i1 @f2() local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
index 1cf9fd9..12b4184 100644
--- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; Function Attrs: nounwind ssp uwtable
;; We should eliminate the sub, and one of the phi nodes
-define void @vnum_test1(ptr %data) #0 {
+define void @vnum_test1(ptr %data) {
; CHECK-LABEL: @vnum_test1(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -74,7 +74,7 @@ bb19: ; preds = %bb4
;; We should eliminate the sub, one of the phi nodes, prove the store of the sub
;; and the load of data are equivalent, that the load always produces constant 0, and
;; delete the load replacing it with constant 0.
-define i32 @vnum_test2(ptr %data) #0 {
+define i32 @vnum_test2(ptr %data) {
; CHECK-LABEL: @vnum_test2(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -142,11 +142,10 @@ bb21: ; preds = %bb4
ret i32 %p.0
}
-
; Function Attrs: nounwind ssp uwtable
;; Same as test 2, with a conditional store of m-n, so it has to also discover
;; that data ends up with the same value no matter what branch is taken.
-define i32 @vnum_test3(ptr %data) #0 {
+define i32 @vnum_test3(ptr %data) {
; CHECK-LABEL: @vnum_test3(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -305,7 +304,6 @@ bb3: ; preds = %bb2
%tmp3 = sub i32 %tmp, %phi2
ret i32 %tmp3
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0, !0, !0}
diff --git a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
index 8b2d662..b6da86b 100644
--- a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
+++ b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
; CHECK: @_Z4testv()
; CHECK: invoke.cont:
; CHECK: br i1 true, label %new.notnull.i11, label %if.end.i14
@@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
entry:
%sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
%FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
%EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -83,11 +83,11 @@ invoke.cont3: ; preds = %invoke.cont2
br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
ret void
lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -98,7 +98,7 @@ lpad: ; preds = %if.end.i14, %if.end
br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
br label %eh.resume
eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -106,24 +106,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}
; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
index ba4fc14..388b941 100644
--- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll
+++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; one set of indexing calculations and a load
; Function Attrs: nounwind ssp uwtable
-define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 {
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: @bar(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB3:%.*]]
@@ -59,8 +59,6 @@ bb20: ; preds = %bb17
ret i32 %tmp14
}
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll
index f83d145..71e9041 100644
--- a/llvm/test/Transforms/NewGVN/memory-handling.ll
+++ b/llvm/test/Transforms/NewGVN/memory-handling.ll
@@ -282,10 +282,10 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #2
; Function Attrs: inlinehint nounwind readonly uwtable
declare i32 @tolower(i32) local_unnamed_addr #3
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { inlinehint nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { inlinehint nounwind readonly uwtable }
attributes #4 = { nounwind readnone }
attributes #5 = { nounwind readonly }
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index 82e9a2a..c1fb836 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -6,7 +6,7 @@ target datalayout = "E-m:e-i64:64-n32:64"
;; Ensure we do not believe the indexing increments are unreachable due to incorrect memory
;; equivalence detection. In PR31483, we were deleting those blocks as unreachable
; Function Attrs: nounwind
-define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
+define signext i32 @ham(ptr %arg, ptr %arg1) {
; CHECK-LABEL: @ham(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
@@ -89,12 +89,7 @@ bb23: ; preds = %bb2
ret i32 undef
}
-declare signext i32 @zot(ptr, ...) #1
+declare signext i32 @zot(ptr, ...)
; Function Attrs: nounwind
-declare void @llvm.va_end(ptr) #2
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
+declare void @llvm.va_end(ptr)
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll
index 353c693..b2ba42b 100644
--- a/llvm/test/Transforms/NewGVN/pr31501.ll
+++ b/llvm/test/Transforms/NewGVN/pr31501.ll
@@ -49,9 +49,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
%struct.wombat.28 = type <{ ptr, i8, i8, [6 x i8] }>
; Function Attrs: norecurse nounwind ssp uwtable
-define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 {
+define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr align 2 {
; CHECK-LABEL: define weak_odr hidden ptr @quux(
-; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr align 2 {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]]
@@ -112,8 +112,6 @@ bb21: ; preds = %bb19, %bb
ret ptr %tmp22
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll
index 969f172..61eb7a5 100644
--- a/llvm/test/Transforms/NewGVN/pr33187.ll
+++ b/llvm/test/Transforms/NewGVN/pr33187.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;; Ensure we don't change after value numbering by accidentally deleting the wrong expression.
; RUN: opt -passes=newgvn -S %s | FileCheck %s
-define void @fn1(i1 %arg) local_unnamed_addr #0 {
+define void @fn1(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @fn1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND_PREHEADER:%.*]]
@@ -89,8 +89,7 @@ if.end18: ; preds = %L, %while.body12
br label %while.cond10
}
-
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
@@ -108,9 +107,6 @@ bb1: ; preds = %bb1, %bb
br label %bb1
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-
define void @a(i1 %arg) {
; CHECK-LABEL: @a(
; CHECK-NEXT: b:
@@ -143,4 +139,3 @@ f: ; preds = %d
%e = getelementptr i8, ptr %i, i64 1
br label %d
}
-
diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll
index e742f14..ff645f8 100644
--- a/llvm/test/Transforms/NewGVN/pr33305.ll
+++ b/llvm/test/Transforms/NewGVN/pr33305.ll
@@ -16,9 +16,9 @@ target triple = "x86_64-apple-macosx10.12.0"
@str.2 = private unnamed_addr constant [8 x i8] c"Screwed\00"
; Function Attrs: nounwind optsize ssp uwtable
-define i32 @main() local_unnamed_addr #0 {
+define i32 @main() local_unnamed_addr {
; CHECK-LABEL: define i32 @main(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[INT_TBAA3:![0-9]+]]
; CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1
@@ -77,7 +77,7 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
; CHECK-NEXT: [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2)
-; CHECK-NEXT: tail call void @abort() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: tail call void @abort()
; CHECK-NEXT: unreachable
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(ptr @str)
@@ -153,7 +153,7 @@ fn1.exit: ; preds = %if.then.i, %for.end
if.then: ; preds = %fn1.exit
%puts2 = tail call i32 @puts(ptr @str.2)
- tail call void @abort() #3
+ tail call void @abort()
unreachable
if.end: ; preds = %fn1.exit
@@ -162,15 +162,10 @@ if.end: ; preds = %fn1.exit
}
; Function Attrs: noreturn nounwind optsize
-declare void @abort() local_unnamed_addr #1
+declare void @abort() local_unnamed_addr
; Function Attrs: nounwind
-declare i32 @puts(ptr nocapture readonly) local_unnamed_addr #2
-
-attributes #0 = { nounwind optsize ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noreturn nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-attributes #3 = { noreturn nounwind optsize }
+declare i32 @puts(ptr nocapture readonly) local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/pr34430.ll b/llvm/test/Transforms/NewGVN/pr34430.ll
index 490ba433..62d5770 100644
--- a/llvm/test/Transforms/NewGVN/pr34430.ll
+++ b/llvm/test/Transforms/NewGVN/pr34430.ll
@@ -4,7 +4,7 @@
source_filename = "bugpoint-output-e4c7d0f.bc"
; Make sure we still properly resolve phi cycles when they involve predicateinfo copies of phis.
-define void @hoge(i1 %arg) local_unnamed_addr #0 {
+define void @hoge(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br i1 %arg, label [[BB6:%.*]], label [[BB1:%.*]]
@@ -41,8 +41,6 @@ bb6: ; preds = %bb4, %bb2, %bb1, %b
br label %bb4
}
-attributes #0 = { norecurse noreturn nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 6.0.0"}
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll
index 48bdd88..7c38147 100644
--- a/llvm/test/Transforms/NewGVN/pr34452.ll
+++ b/llvm/test/Transforms/NewGVN/pr34452.ll
@@ -6,9 +6,9 @@ source_filename = "bugpoint-output-09f7a24.bc"
@WHOLELINE = external local_unnamed_addr global i32, align 4
; Function Attrs: nounwind uwtable
-define void @sgrep() local_unnamed_addr #0 {
+define void @sgrep() local_unnamed_addr {
; CHECK-LABEL: define void @sgrep(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[INT_TBAA1:![0-9]+]]
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -36,10 +36,7 @@ while.body.us: ; preds = %while.body.us, %ent
}
; Function Attrs: nounwind readnone speculatable
-declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/OpenMP/dead_use.ll b/llvm/test/Transforms/OpenMP/dead_use.ll
index 1c4b2c6..ad0f91c 100644
--- a/llvm/test/Transforms/OpenMP/dead_use.ll
+++ b/llvm/test/Transforms/OpenMP/dead_use.ll
@@ -6,9 +6,9 @@
@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8
; Function Attrs: nounwind uwtable
-define dso_local i32 @b() #0 {
+define dso_local i32 @b() {
; CHECK-LABEL: define dso_local i32 @b(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) {
; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -21,9 +21,9 @@ define dso_local i32 @b() #0 {
}
; Function Attrs: nounwind uwtable
-define internal i32 @a() #0 {
+define internal i32 @a() {
; CHECK-LABEL: define internal i32 @a(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) {
; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @b()
; CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB0:[0-9]+]], i32 0, ptr @.omp_outlined.)
@@ -38,9 +38,9 @@ define internal i32 @a() #0 {
}
; Function Attrs: norecurse nounwind uwtable
-define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 {
+define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) {
; CHECK-LABEL: define internal void @.omp_outlined.(
-; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) {
; CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[TMP4:%.*]] = alloca ptr, align 8
; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]]
@@ -55,11 +55,7 @@ define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 {
}
; Function Attrs: nounwind
-declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) #2
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/OpenMP/icv_remarks.ll b/llvm/test/Transforms/OpenMP/icv_remarks.ll
index f76d487..3caa56f 100644
--- a/llvm/test/Transforms/OpenMP/icv_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/icv_remarks.ll
@@ -14,55 +14,48 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV active_levels Value: 0
; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV cancel Value: 0
-define dso_local void @foo(i32 %a) local_unnamed_addr #0 !dbg !17 {
+define dso_local void @foo(i32 %a) local_unnamed_addr !dbg !17 {
entry:
%.kmpc_loc.addr = alloca %struct.ident_t, align 8
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(24) %.kmpc_loc.addr, ptr nonnull align 8 dereferenceable(24) @0, i64 16, i1 false)
call void @llvm.dbg.value(metadata i32 %a, metadata !19, metadata !DIExpression()), !dbg !21
- tail call void @omp_set_num_threads(i32 %a) #1, !dbg !22
- %call = tail call i32 @omp_get_max_threads() #1, !dbg !23
+ tail call void @omp_set_num_threads(i32 %a), !dbg !22
+ %call = tail call i32 @omp_get_max_threads(), !dbg !23
call void @llvm.dbg.value(metadata i32 %call, metadata !20, metadata !DIExpression()), !dbg !21
- tail call void @use(i32 %call) #1, !dbg !24
+ tail call void @use(i32 %call), !dbg !24
%0 = getelementptr inbounds %struct.ident_t, ptr %.kmpc_loc.addr, i64 0, i32 4, !dbg !25
store ptr @1, ptr %0, align 8, !dbg !25, !tbaa !26
- call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.) #1, !dbg !25
+ call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.), !dbg !25
ret void, !dbg !32
}
-declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr #1
+declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr
-declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr #1
+declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr
-declare !dbg !12 dso_local void @use(i32) local_unnamed_addr #2
+declare !dbg !12 dso_local void @use(i32) local_unnamed_addr
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV active_levels Value: 0
; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV cancel Value: 0
-define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) #3 !dbg !33 {
+define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) !dbg !33 {
entry:
call void @llvm.dbg.value(metadata ptr %.global_tid., metadata !41, metadata !DIExpression()), !dbg !43
call void @llvm.dbg.value(metadata ptr %.bound_tid., metadata !42, metadata !DIExpression()), !dbg !43
- call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()) #1, !dbg !50
- call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()) #1, !dbg !50
- tail call void @omp_set_num_threads(i32 10) #1, !dbg !52
- %call.i = tail call i32 @omp_get_max_threads() #1, !dbg !53
- call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()) #1, !dbg !54
- tail call void @use(i32 %call.i) #1, !dbg !55
+ call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()), !dbg !50
+ call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()), !dbg !50
+ tail call void @omp_set_num_threads(i32 10), !dbg !52
+ %call.i = tail call i32 @omp_get_max_threads(), !dbg !53
+ call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()), !dbg !54
+ tail call void @use(i32 %call.i), !dbg !55
ret void, !dbg !56
}
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr #1
+declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr
-declare void @llvm.dbg.value(metadata, metadata, metadata) #5
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind willreturn }
-attributes #5 = { nounwind readnone speculatable willreturn }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14, !15, !59}
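For readers unfamiliar with the construct these hunks keep deleting: LLVM IR can attach function attributes either inline on a definition/declaration or through a numbered attribute group referenced as #N and defined once at module scope. A minimal sketch of the two equivalent spellings, using a hypothetical function @f (the two lines of each form would not both appear in one module):

; attribute-group form: the declaration references group #0
declare void @f() #0
attributes #0 = { nounwind }

; inline form: the same attribute written directly on the declaration
declare void @f() nounwind

When a test's CHECK lines never exercise the attributes, the #N references and their attributes lines are dead weight, which is why they can be dropped wholesale here.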
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
index 0a1efda..eb1cc5e 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll
@@ -9,7 +9,6 @@
; CHECK-NOT: remark: {{.*}}
; CHECK: !{!"branch_weights", i32 0, i32 200000}
-
; ModuleID = 'misexpect-branch-correct.c'
source_filename = "misexpect-branch-correct.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -19,14 +18,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !3
- call void @llvm.lifetime.start.p0(ptr %x) #4
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !3
%0 = load i32, ptr %rando, align 4, !tbaa !3
%rem = srem i32 %0, 200000
@@ -52,31 +51,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !3
- call void @llvm.lifetime.end.p0(ptr %x) #4
- call void @llvm.lifetime.end.p0(ptr %rando) #4
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
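The @llvm.expect.i64 intrinsic declared in these misexpect tests is the IR form of __builtin_expect: it returns its first operand unchanged while recording that the second operand is the expected value, which the misexpect diagnostics then compare against real profile counts. A minimal sketch, with hypothetical values, of the pattern a frontend typically emits:

%cmp = icmp eq i32 %x, 0
%conv = zext i1 %cmp to i64
%expval = call i64 @llvm.expect.i64(i64 %conv, i64 1) ; annotate: %cmp expected true
%tobool = icmp ne i64 %expval, 0
br i1 %tobool, label %likely, label %unlikely

The LowerExpectIntrinsic pass turns this into !prof branch weights; a misexpect remark fires when PGO data contradicts the annotated expectation.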
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
index 68b233e..3390825 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll
@@ -16,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 !dbg !6 {
+define i32 @bar() !dbg !6 {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9
+ call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9
%call = call i32 (...) @buzz(), !dbg !9
store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10
- call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14
+ call void @llvm.lifetime.start.p0(ptr %x), !dbg !14
store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10
%0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10
%rem = srem i32 %0, 200000, !dbg !15
@@ -49,31 +49,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10
- call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20
- call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %x), !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20
ret i32 %2, !dbg !19
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
index 2f188f5..d34708c 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll
@@ -19,7 +19,6 @@
; DISABLED-NOT: warning: <unknown>:0:0: 19.98%
; DISABLED-NOT: remark: <unknown>:0:0: Potential performance regression from use of the llvm.expect intrinsic: Annotation was correct on 19.98% (399668 / 2000000) of profiled executions.
-
; ModuleID = 'misexpect-branch.c'
source_filename = "misexpect-branch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -29,14 +28,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !3
- call void @llvm.lifetime.start.p0(ptr %x) #4
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !3
%0 = load i32, ptr %rando, align 4, !tbaa !3
%rem = srem i32 %0, 200000
@@ -62,31 +61,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !3
- call void @llvm.lifetime.end.p0(ptr %x) #4
- call void @llvm.lifetime.end.p0(ptr %rando) #4
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
index 4add781..a43e632 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll
@@ -7,7 +7,6 @@
; CHECK-NOT: warning: {{.*}}
; CHECK-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-branch-unpredictable.c'
source_filename = "clang/test/Profile/misexpect-branch-unpredictable.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -17,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 {
+define i32 @bar() {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #3
+ call void @llvm.lifetime.start.p0(ptr %rando)
%call = call i32 (...) @buzz()
store i32 %call, ptr %rando, align 4, !tbaa !2
- call void @llvm.lifetime.start.p0(ptr %x) #3
+ call void @llvm.lifetime.start.p0(ptr %x)
store i32 0, ptr %x, align 4, !tbaa !2
%0 = load i32, ptr %rando, align 4, !tbaa !2
%rem = srem i32 %0, 200000
@@ -49,27 +48,22 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !tbaa !2
- call void @llvm.lifetime.end.p0(ptr %x) #3
- call void @llvm.lifetime.end.p0(ptr %rando) #3
+ call void @llvm.lifetime.end.p0(ptr %x)
+ call void @llvm.lifetime.end.p0(ptr %rando)
ret i32 %2
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
index 5a7731b..7e01f7a 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll
@@ -33,14 +33,14 @@ target triple = "x86_64-unknown-linux-gnu"
@outer_loop = constant i32 2000, align 4
; Function Attrs: nounwind
-define i32 @bar() #0 !dbg !6 {
+define i32 @bar() !dbg !6 {
entry:
%rando = alloca i32, align 4
%x = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9
+ call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9
%call = call i32 (...) @buzz(), !dbg !9
store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10
- call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14
+ call void @llvm.lifetime.start.p0(ptr %x), !dbg !14
store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10
%0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10
%rem = srem i32 %0, 200000, !dbg !15
@@ -66,31 +66,25 @@ if.else: ; preds = %entry
if.end: ; preds = %if.else, %if.then
%2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10
- call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20
- call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %x), !dbg !20
+ call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20
ret i32 %2, !dbg !19
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare i32 @buzz(...) #2
+declare i32 @buzz(...)
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #3
+declare i64 @llvm.expect.i64(i64, i64)
-declare i32 @baz(i32) #2
+declare i32 @baz(i32)
-declare i32 @foo(i32) #2
+declare i32 @foo(i32)
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readnone willreturn }
-attributes #4 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
index 859ba72..38c27b9 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll
@@ -25,7 +25,6 @@
; CORRECT-NOT: warning: {{.*}}
; CORRECT-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-switch.c'
source_filename = "misexpect-switch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -37,10 +36,10 @@ target triple = "x86_64-unknown-linux-gnu"
@arry = dso_local global [25 x i32] zeroinitializer, align 16
; Function Attrs: nounwind uwtable
-define dso_local void @init_arry() #0 {
+define dso_local void @init_arry() {
entry:
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %i) #6
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !4
br label %for.cond
@@ -50,7 +49,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
- %call = call i32 @rand() #6
+ %call = call i32 @rand()
%rem = srem i32 %call, 10
%1 = load i32, ptr %i, align 4, !tbaa !4
%idxprom = sext i32 %1 to i64
@@ -65,24 +64,24 @@ for.inc: ; preds = %for.body
br label %for.cond
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #6
+ call void @llvm.lifetime.end.p0(ptr %i)
ret void
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare dso_local i32 @rand() #3
+declare dso_local i32 @rand()
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define dso_local i32 @main() #0 {
+define dso_local i32 @main() {
entry:
%retval = alloca i32, align 4
%val = alloca i32, align 4
@@ -90,9 +89,9 @@ entry:
%condition = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @init_arry()
- call void @llvm.lifetime.start.p0(ptr %val) #6
+ call void @llvm.lifetime.start.p0(ptr %val)
store i32 0, ptr %val, align 4, !tbaa !4
- call void @llvm.lifetime.start.p0(ptr %j) #6
+ call void @llvm.lifetime.start.p0(ptr %j)
store i32 0, ptr %j, align 4, !tbaa !4
br label %for.cond
@@ -102,8 +101,8 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
- call void @llvm.lifetime.start.p0(ptr %condition) #6
- %call = call i32 @rand() #6
+ call void @llvm.lifetime.start.p0(ptr %condition)
+ %call = call i32 @rand()
%rem = srem i32 %call, 5
store i32 %rem, ptr %condition, align 4, !tbaa !4
%1 = load i32, ptr %condition, align 4, !tbaa !4
@@ -138,7 +137,7 @@ sw.default: ; preds = %for.body
unreachable
sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb
- call void @llvm.lifetime.end.p0(ptr %condition) #6
+ call void @llvm.lifetime.end.p0(ptr %condition)
br label %for.inc
for.inc: ; preds = %sw.epilog
@@ -148,25 +147,17 @@ for.inc: ; preds = %sw.epilog
br label %for.cond
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %j) #6
- call void @llvm.lifetime.end.p0(ptr %val) #6
+ call void @llvm.lifetime.end.p0(ptr %j)
+ call void @llvm.lifetime.end.p0(ptr %val)
ret i32 0
}
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #4
-
-declare dso_local i32 @sum(ptr, i32) #5
+declare i64 @llvm.expect.i64(i64, i64)
-declare dso_local i32 @random_sample(ptr, i32) #5
+declare dso_local i32 @sum(ptr, i32)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
+declare dso_local i32 @random_sample(ptr, i32)
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
index 242d5b8..9665559 100644
--- a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
+++ b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll
@@ -28,7 +28,6 @@
; CORRECT-NOT: warning: {{.*}}
; CORRECT-NOT: remark: {{.*}}
-
; ModuleID = 'misexpect-switch.c'
source_filename = "misexpect-switch.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -40,10 +39,10 @@ target triple = "x86_64-unknown-linux-gnu"
@arry = dso_local global [25 x i32] zeroinitializer, align 16, !dbg !12
; Function Attrs: nounwind uwtable
-define dso_local void @init_arry() #0 !dbg !21 {
+define dso_local void @init_arry() !dbg !21 {
entry:
%i = alloca i32, align 4
- call void @llvm.lifetime.start.p0(ptr %i) #6, !dbg !26
+ call void @llvm.lifetime.start.p0(ptr %i), !dbg !26
call void @llvm.dbg.declare(metadata ptr %i, metadata !25, metadata !DIExpression()), !dbg !27
store i32 0, ptr %i, align 4, !dbg !28, !tbaa !30
br label %for.cond, !dbg !34
@@ -54,7 +53,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !dbg !38
for.body: ; preds = %for.cond
- %call = call i32 @rand() #6, !dbg !39
+ %call = call i32 @rand(), !dbg !39
%rem = srem i32 %call, 10, !dbg !41
%1 = load i32, ptr %i, align 4, !dbg !42, !tbaa !30
%idxprom = sext i32 %1 to i64, !dbg !43
@@ -69,24 +68,24 @@ for.inc: ; preds = %for.body
br label %for.cond, !dbg !47, !llvm.loop !48
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #6, !dbg !50
+ call void @llvm.lifetime.end.p0(ptr %i), !dbg !50
ret void, !dbg !50
}
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare dso_local i32 @rand() #3
+declare dso_local i32 @rand()
; Function Attrs: argmemonly nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define dso_local i32 @main() #0 !dbg !51 {
+define dso_local i32 @main() !dbg !51 {
entry:
%retval = alloca i32, align 4
%val = alloca i32, align 4
@@ -94,10 +93,10 @@ entry:
%condition = alloca i32, align 4
store i32 0, ptr %retval, align 4
call void @init_arry(), !dbg !62
- call void @llvm.lifetime.start.p0(ptr %val) #6, !dbg !63
+ call void @llvm.lifetime.start.p0(ptr %val), !dbg !63
call void @llvm.dbg.declare(metadata ptr %val, metadata !55, metadata !DIExpression()), !dbg !64
store i32 0, ptr %val, align 4, !dbg !64, !tbaa !30
- call void @llvm.lifetime.start.p0(ptr %j) #6, !dbg !65
+ call void @llvm.lifetime.start.p0(ptr %j), !dbg !65
call void @llvm.dbg.declare(metadata ptr %j, metadata !56, metadata !DIExpression()), !dbg !66
store i32 0, ptr %j, align 4, !dbg !67, !tbaa !30
br label %for.cond, !dbg !68
@@ -108,9 +107,9 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.end, !dbg !71
for.body: ; preds = %for.cond
- call void @llvm.lifetime.start.p0(ptr %condition) #6, !dbg !72
+ call void @llvm.lifetime.start.p0(ptr %condition), !dbg !72
call void @llvm.dbg.declare(metadata ptr %condition, metadata !57, metadata !DIExpression()), !dbg !73
- %call = call i32 @rand() #6, !dbg !74
+ %call = call i32 @rand(), !dbg !74
%rem = srem i32 %call, 5, !dbg !75
store i32 %rem, ptr %condition, align 4, !dbg !73, !tbaa !30
%1 = load i32, ptr %condition, align 4, !dbg !76, !tbaa !30
@@ -145,7 +144,7 @@ sw.default: ; preds = %for.body
unreachable, !dbg !87
sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb
- call void @llvm.lifetime.end.p0(ptr %condition) #6, !dbg !88
+ call void @llvm.lifetime.end.p0(ptr %condition), !dbg !88
br label %for.inc, !dbg !89
for.inc: ; preds = %sw.epilog
@@ -155,25 +154,17 @@ for.inc: ; preds = %sw.epilog
br label %for.cond, !dbg !91, !llvm.loop !92
for.end: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %j) #6, !dbg !94
- call void @llvm.lifetime.end.p0(ptr %val) #6, !dbg !94
+ call void @llvm.lifetime.end.p0(ptr %j), !dbg !94
+ call void @llvm.lifetime.end.p0(ptr %val), !dbg !94
ret i32 0, !dbg !95
}
; Function Attrs: nounwind readnone willreturn
-declare i64 @llvm.expect.i64(i64, i64) #4
-
-declare dso_local i32 @sum(ptr, i32) #5
+declare i64 @llvm.expect.i64(i64, i64)
-declare dso_local i32 @random_sample(ptr, i32) #5
+declare dso_local i32 @sum(ptr, i32)
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nounwind readnone speculatable willreturn }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
+declare dso_local i32 @random_sample(ptr, i32)
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17, !18, !19}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
index 27e8fd0..3b61750 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
@@ -6,7 +6,7 @@ target triple = "aarch64"
; This function (a 16x reduction of a[i] * b[i]) should be vectorized successfully.
-define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) #0 {
+define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) {
; CHECK-LABEL: define dso_local nofpclass(nan inf) float @vmlaq
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
@@ -21,9 +21,9 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
%6 = alloca i32, align 4
store ptr %0, ptr %3, align 8, !tbaa !4
store ptr %1, ptr %4, align 8, !tbaa !4
- call void @llvm.lifetime.start.p0(ptr %5) #2
+ call void @llvm.lifetime.start.p0(ptr %5)
store float 0.000000e+00, ptr %5, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %6) #2
+ call void @llvm.lifetime.start.p0(ptr %6)
store i32 0, ptr %6, align 4, !tbaa !11
br label %7
@@ -33,7 +33,7 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
br i1 %9, label %11, label %10
10: ; preds = %7
- call void @llvm.lifetime.end.p0(ptr %6) #2
+ call void @llvm.lifetime.end.p0(ptr %6)
br label %28
11: ; preds = %7
@@ -61,16 +61,12 @@ define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1)
28: ; preds = %10
%29 = load float, ptr %5, align 4, !tbaa !9
- call void @llvm.lifetime.end.p0(ptr %5) #2
+ call void @llvm.lifetime.end.p0(ptr %5)
ret float %29
}
-declare void @llvm.lifetime.start.p0(ptr captures(none)) #1
-declare void @llvm.lifetime.end.p0(ptr captures(none)) #1
-
-attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+declare void @llvm.lifetime.end.p0(ptr captures(none))
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
index 68bfbc1..eefde9d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -6,7 +6,7 @@ target triple = "aarch64"
; This function (a more complex reduction of (a[i] - b[i]) * itself) should be vectorized successfully.
-define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #0 {
+define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: .preheader.i:
@@ -125,7 +125,7 @@ define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %
ret float %13
}
-define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #1 {
+define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
%5 = alloca ptr, align 8
%6 = alloca ptr, align 8
%7 = alloca i32, align 4
@@ -143,15 +143,15 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
store ptr %1, ptr %6, align 8, !tbaa !4
store i32 %2, ptr %7, align 4, !tbaa !9
store i32 %3, ptr %8, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %9) #3
+ call void @llvm.lifetime.start.p0(ptr %9)
store i32 3, ptr %9, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %10) #3
+ call void @llvm.lifetime.start.p0(ptr %10)
store i32 3, ptr %10, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %11) #3
+ call void @llvm.lifetime.start.p0(ptr %11)
store i32 7, ptr %11, align 4, !tbaa !9
- call void @llvm.lifetime.start.p0(ptr %12) #3
+ call void @llvm.lifetime.start.p0(ptr %12)
store float 0.000000e+00, ptr %12, align 4, !tbaa !11
- call void @llvm.lifetime.start.p0(ptr %13) #3
+ call void @llvm.lifetime.start.p0(ptr %13)
store i32 0, ptr %13, align 4, !tbaa !9
br label %18
@@ -162,13 +162,13 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
21: ; preds = %18
store i32 2, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %13) #3
+ call void @llvm.lifetime.end.p0(ptr %13)
br label %62
22: ; preds = %18
- call void @llvm.lifetime.start.p0(ptr %15) #3
+ call void @llvm.lifetime.start.p0(ptr %15)
store float 0.000000e+00, ptr %15, align 4, !tbaa !11
- call void @llvm.lifetime.start.p0(ptr %16) #3
+ call void @llvm.lifetime.start.p0(ptr %16)
store i32 0, ptr %16, align 4, !tbaa !9
br label %23
@@ -179,11 +179,11 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
26: ; preds = %23
store i32 5, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %16) #3
+ call void @llvm.lifetime.end.p0(ptr %16)
br label %47
27: ; preds = %23
- call void @llvm.lifetime.start.p0(ptr %17) #3
+ call void @llvm.lifetime.start.p0(ptr %17)
%28 = load ptr, ptr %5, align 8, !tbaa !4
%29 = load i32, ptr %16, align 4, !tbaa !9
%30 = sext i32 %29 to i64
@@ -202,7 +202,7 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
%42 = load float, ptr %15, align 4, !tbaa !11
%43 = fadd fast float %42, %41
store float %43, ptr %15, align 4, !tbaa !11
- call void @llvm.lifetime.end.p0(ptr %17) #3
+ call void @llvm.lifetime.end.p0(ptr %17)
br label %44
44: ; preds = %27
@@ -226,7 +226,7 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
%57 = load float, ptr %12, align 4, !tbaa !11
%58 = fadd fast float %57, %56
store float %58, ptr %12, align 4, !tbaa !11
- call void @llvm.lifetime.end.p0(ptr %15) #3
+ call void @llvm.lifetime.end.p0(ptr %15)
br label %59
59: ; preds = %47
@@ -238,20 +238,15 @@ define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr
62: ; preds = %21
%63 = load float, ptr %12, align 4, !tbaa !11
store i32 1, ptr %14, align 4
- call void @llvm.lifetime.end.p0(ptr %12) #3
- call void @llvm.lifetime.end.p0(ptr %11) #3
- call void @llvm.lifetime.end.p0(ptr %10) #3
- call void @llvm.lifetime.end.p0(ptr %9) #3
+ call void @llvm.lifetime.end.p0(ptr %12)
+ call void @llvm.lifetime.end.p0(ptr %11)
+ call void @llvm.lifetime.end.p0(ptr %10)
+ call void @llvm.lifetime.end.p0(ptr %9)
ret float %63
}
-declare void @llvm.lifetime.start.p0(ptr captures(none)) #2
-declare void @llvm.lifetime.end.p0(ptr captures(none)) #2
-
-attributes #0 = { mustprogress uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { inlinehint mustprogress nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+declare void @llvm.lifetime.end.p0(ptr captures(none))
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
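The scalar loops in the two reductions above accumulate with fadd fast, and that reassociation freedom is what lets the phase-ordering pipeline rewrite them as horizontal reductions. A minimal sketch, on a hypothetical <4 x float> %v, of the reduced form the vectorizer targets:

%sum = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %v)

With the fast (or reassoc) flag the start value and lanes may be combined in any order; without it the intrinsic is a strictly ordered left-to-right reduction, which generally blocks this transformation.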
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 92e625d..82ecc3a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -10,7 +10,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
; Function Attrs: nounwind uwtable
-define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 {
+define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) {
; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering(
; CHECK-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
@@ -136,14 +136,14 @@ entry:
store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8
store ptr %p2, ptr %p2.addr, align 8, !tbaa !4
store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %emp) #2
- call void @llvm.lifetime.start.p0(ptr %r0) #2
- call void @llvm.lifetime.start.p0(ptr %r1) #2
- call void @llvm.lifetime.start.p0(ptr %r2) #2
- call void @llvm.lifetime.start.p0(ptr %r3) #2
- call void @llvm.lifetime.start.p0(ptr %sum) #2
+ call void @llvm.lifetime.start.p0(ptr %emp)
+ call void @llvm.lifetime.start.p0(ptr %r0)
+ call void @llvm.lifetime.start.p0(ptr %r1)
+ call void @llvm.lifetime.start.p0(ptr %r2)
+ call void @llvm.lifetime.start.p0(ptr %r3)
+ call void @llvm.lifetime.start.p0(ptr %sum)
store i32 0, ptr %sum, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %i) #2
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !8
br label %for.cond
@@ -153,7 +153,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #2
+ call void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -241,22 +241,22 @@ for.body: ; preds = %for.cond
%shl42 = shl i32 %sub41, 16
%rdd43 = add nsw i32 %sub36, %shl42
store i32 %rdd43, ptr %r3, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e0) #2
+ call void @llvm.lifetime.start.p0(ptr %e0)
%33 = load i32, ptr %r0, align 4, !tbaa !8
%34 = load i32, ptr %r1, align 4, !tbaa !8
%rdd44 = add i32 %33, %34
store i32 %rdd44, ptr %e0, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e1) #2
+ call void @llvm.lifetime.start.p0(ptr %e1)
%35 = load i32, ptr %r0, align 4, !tbaa !8
%36 = load i32, ptr %r1, align 4, !tbaa !8
%sub45 = sub i32 %35, %36
store i32 %sub45, ptr %e1, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e2) #2
+ call void @llvm.lifetime.start.p0(ptr %e2)
%37 = load i32, ptr %r2, align 4, !tbaa !8
%38 = load i32, ptr %r3, align 4, !tbaa !8
%rdd46 = add i32 %37, %38
store i32 %rdd46, ptr %e2, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e3) #2
+ call void @llvm.lifetime.start.p0(ptr %e3)
%39 = load i32, ptr %r2, align 4, !tbaa !8
%40 = load i32, ptr %r3, align 4, !tbaa !8
%sub47 = sub i32 %39, %40
@@ -293,10 +293,10 @@ for.body: ; preds = %for.cond
%rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60
%rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3
store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8
- call void @llvm.lifetime.end.p0(ptr %e3) #2
- call void @llvm.lifetime.end.p0(ptr %e2) #2
- call void @llvm.lifetime.end.p0(ptr %e1) #2
- call void @llvm.lifetime.end.p0(ptr %e0) #2
+ call void @llvm.lifetime.end.p0(ptr %e3)
+ call void @llvm.lifetime.end.p0(ptr %e2)
+ call void @llvm.lifetime.end.p0(ptr %e1)
+ call void @llvm.lifetime.end.p0(ptr %e0)
br label %for.inc
for.inc: ; preds = %for.body
@@ -316,7 +316,7 @@ for.inc: ; preds = %for.body
br label %for.cond, !llvm.loop !11
for.end: ; preds = %for.cond.cleanup
- call void @llvm.lifetime.start.p0(ptr %i65) #2
+ call void @llvm.lifetime.start.p0(ptr %i65)
store i32 0, ptr %i65, align 4, !tbaa !8
br label %for.cond66
@@ -326,11 +326,11 @@ for.cond66: ; preds = %for.inc114, %for.en
br i1 %cmp67, label %for.body70, label %for.cond.cleanup69
for.cond.cleanup69: ; preds = %for.cond66
- call void @llvm.lifetime.end.p0(ptr %i65) #2
+ call void @llvm.lifetime.end.p0(ptr %i65)
br label %for.end116
for.body70: ; preds = %for.cond66
- call void @llvm.lifetime.start.p0(ptr %e071) #2
+ call void @llvm.lifetime.start.p0(ptr %e071)
%rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
%59 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom73 = sext i32 %59 to i64
@@ -343,7 +343,7 @@ for.body70: ; preds = %for.cond66
%62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8
%rdd78 = add i32 %60, %62
store i32 %rdd78, ptr %e071, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e179) #2
+ call void @llvm.lifetime.start.p0(ptr %e179)
%rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
%63 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom81 = sext i32 %63 to i64
@@ -356,7 +356,7 @@ for.body70: ; preds = %for.cond66
%66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8
%sub86 = sub i32 %64, %66
store i32 %sub86, ptr %e179, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e287) #2
+ call void @llvm.lifetime.start.p0(ptr %e287)
%rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
%67 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom89 = sext i32 %67 to i64
@@ -369,7 +369,7 @@ for.body70: ; preds = %for.cond66
%70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8
%rdd94 = add i32 %68, %70
store i32 %rdd94, ptr %e287, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %e395) #2
+ call void @llvm.lifetime.start.p0(ptr %e395)
%rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
%71 = load i32, ptr %i65, align 4, !tbaa !8
%idxprom97 = sext i32 %71 to i64
@@ -398,10 +398,10 @@ for.body70: ; preds = %for.cond66
%82 = load i32, ptr %e395, align 4, !tbaa !8
%sub106 = sub nsw i32 %81, %82
store i32 %sub106, ptr %r3, align 4, !tbaa !8
- call void @llvm.lifetime.end.p0(ptr %e395) #2
- call void @llvm.lifetime.end.p0(ptr %e287) #2
- call void @llvm.lifetime.end.p0(ptr %e179) #2
- call void @llvm.lifetime.end.p0(ptr %e071) #2
+ call void @llvm.lifetime.end.p0(ptr %e395)
+ call void @llvm.lifetime.end.p0(ptr %e287)
+ call void @llvm.lifetime.end.p0(ptr %e179)
+ call void @llvm.lifetime.end.p0(ptr %e071)
%83 = load i32, ptr %r0, align 4, !tbaa !8
%call = call i32 @twoabs(i32 noundef %83)
%84 = load i32, ptr %r1, align 4, !tbaa !8
@@ -432,28 +432,28 @@ for.end116: ; preds = %for.cond.cleanup69
%shr = lshr i32 %90, 16
%rdd119 = add i32 %conv118, %shr
%shr120 = lshr i32 %rdd119, 1
- call void @llvm.lifetime.end.p0(ptr %sum) #2
- call void @llvm.lifetime.end.p0(ptr %r3) #2
- call void @llvm.lifetime.end.p0(ptr %r2) #2
- call void @llvm.lifetime.end.p0(ptr %r1) #2
- call void @llvm.lifetime.end.p0(ptr %r0) #2
- call void @llvm.lifetime.end.p0(ptr %emp) #2
+ call void @llvm.lifetime.end.p0(ptr %sum)
+ call void @llvm.lifetime.end.p0(ptr %r3)
+ call void @llvm.lifetime.end.p0(ptr %r2)
+ call void @llvm.lifetime.end.p0(ptr %r1)
+ call void @llvm.lifetime.end.p0(ptr %r0)
+ call void @llvm.lifetime.end.p0(ptr %emp)
ret i32 %shr120
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
; Function Attrs: nounwind uwtable
-define internal i32 @twoabs(i32 noundef %r) #0 {
+define internal i32 @twoabs(i32 noundef %r) {
entry:
%r.addr = alloca i32, align 4
%s = alloca i32, align 4
store i32 %r, ptr %r.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %s) #2
+ call void @llvm.lifetime.start.p0(ptr %s)
%0 = load i32, ptr %r.addr, align 4, !tbaa !8
%shr = lshr i32 %0, 15
%rnd = and i32 %shr, 65537
@@ -464,14 +464,10 @@ entry:
%rdd = add i32 %1, %2
%3 = load i32, ptr %s, align 4, !tbaa !8
%xor = xor i32 %rdd, %3
- call void @llvm.lifetime.end.p0(ptr %s) #2
+ call void @llvm.lifetime.end.p0(ptr %s)
ret i32 %xor
}
-attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nounwind }
-
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index 7fb72e6..811957c 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -170,5 +170,5 @@ unreachable: ; preds = %cleanup
declare void @llvm.lifetime.start.p0(ptr nocapture)
declare void @llvm.lifetime.end.p0(ptr nocapture)
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
-attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
+attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 436f848a..6735577 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -9,7 +9,7 @@ target triple = "thumbv6m-none-none-eabi"
; should be deleted, too.
; Function Attrs: nounwind
-define dso_local void @arm_fill_q7(i8 signext %value, ptr %pDst, i32 %blockSize) #0 {
+define dso_local void @arm_fill_q7(i8 signext %value, ptr %pDst, i32 %blockSize) {
; OLDPM-LABEL: @arm_fill_q7(
; OLDPM-NEXT: entry:
; OLDPM-NEXT: [[CMP_NOT20:%.*]] = icmp ult i32 [[BLOCKSIZE:%.*]], 4
@@ -59,8 +59,8 @@ entry:
store i8 %value, ptr %value.addr, align 1, !tbaa !3
store ptr %pDst, ptr %pDst.addr, align 4, !tbaa !6
store i32 %blockSize, ptr %blockSize.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %blkCnt) #3
- call void @llvm.lifetime.start.p0(ptr %packedValue) #3
+ call void @llvm.lifetime.start.p0(ptr %blkCnt)
+ call void @llvm.lifetime.start.p0(ptr %packedValue)
%0 = load i8, ptr %value.addr, align 1, !tbaa !3
%conv = sext i8 %0 to i32
%shl = shl i32 %conv, 0
@@ -122,23 +122,23 @@ while.body16: ; preds = %while.cond13
br label %while.cond13, !llvm.loop !12
while.end18: ; preds = %while.cond13
- call void @llvm.lifetime.end.p0(ptr %packedValue) #3
- call void @llvm.lifetime.end.p0(ptr %blkCnt) #3
+ call void @llvm.lifetime.end.p0(ptr %packedValue)
+ call void @llvm.lifetime.end.p0(ptr %blkCnt)
ret void
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: alwaysinline nounwind
-define internal void @write_q7x4_ia(ptr %pQ7, i32 %value) #2 {
+define internal void @write_q7x4_ia(ptr %pQ7, i32 %value) {
entry:
%pQ7.addr = alloca ptr, align 4
%value.addr = alloca i32, align 4
%val = alloca i32, align 4
store ptr %pQ7, ptr %pQ7.addr, align 4, !tbaa !6
store i32 %value, ptr %value.addr, align 4, !tbaa !8
- call void @llvm.lifetime.start.p0(ptr %val) #3
+ call void @llvm.lifetime.start.p0(ptr %val)
%0 = load i32, ptr %value.addr, align 4, !tbaa !8
store i32 %0, ptr %val, align 4, !tbaa !8
%1 = load i32, ptr %val, align 4, !tbaa !8
@@ -175,17 +175,12 @@ entry:
%14 = load ptr, ptr %13, align 4, !tbaa !6
%add.ptr = getelementptr inbounds i8, ptr %14, i32 4
store ptr %add.ptr, ptr %13, align 4, !tbaa !6
- call void @llvm.lifetime.end.p0(ptr %val) #3
+ call void @llvm.lifetime.end.p0(ptr %val)
ret void
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0plus" "target-features"="+armv6-m,+strict-align,+thumb-mode,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-dsp,-fp16fml,-fullfp16,-hwdiv,-hwdiv-arm,-i8mm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0plus" "target-features"="+armv6-m,+strict-align,+thumb-mode,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-dsp,-fp16fml,-fullfp16,-hwdiv,-hwdiv-arm,-i8mm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index c186207..4274719 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -133,7 +133,7 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1
declare i32 @llvm.arm.mve.addv.v16i8(<16 x i8>, i32) #2
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-pacbti,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-pacbti,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #1 = { argmemonly nocallback nofree nosync nounwind willreturn }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 42fdafb..5127b7d 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -216,7 +216,7 @@ unreachable: ; preds = %cleanup
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" }
+attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
attributes #3 = { nounwind }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index 8e25c9c..c5f56d3 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -15,7 +15,7 @@
; Ideally, this should reach the backend with 1 fmul, 1 fsub, 1 fadd, and 1 shuffle.
; That may require some coordination between VectorCombine, SLP, and other passes.
-define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_ps128(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x float> [[A]], [[B:%.*]]
@@ -43,7 +43,7 @@ define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D,
ret <4 x float> %vecinsert4
}
-define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_pd128(
; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x double> [[A]], [[B:%.*]]
@@ -63,7 +63,7 @@ define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double>
ret <2 x double> %vecinsert2
}
-define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_ps256(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
@@ -123,7 +123,7 @@ define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D,
ret <8 x float> %vecinsert8
}
-define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) {
; CHECK-LABEL: @buildvector_mul_addsub_pd256(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x double> [[A]], [[B:%.*]]
@@ -151,7 +151,7 @@ define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double>
ret <4 x double> %vecinsert4
}
-define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_ps512(
; SSE2-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
@@ -243,7 +243,7 @@ define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float>
ret <16 x float> %vecinsert16
}
-define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_addsub_ps512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
@@ -350,7 +350,7 @@ define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x
ret <16 x float> %vecinsert16
}
-define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE2-LABEL: @buildvector_mul_addsub_pd512(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
@@ -410,7 +410,7 @@ define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double>
ret <8 x double> %vecinsert8
}
-define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_addsub_pd512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
@@ -507,7 +507,7 @@ define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x
ret <8 x double> %vecinsert8
}
-define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_ps128(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x float> [[A]], [[B:%.*]]
@@ -535,7 +535,7 @@ define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D,
ret <4 x float> %vecinsert4
}
-define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_pd128(
; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[A]], [[B:%.*]]
@@ -555,7 +555,7 @@ define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double>
ret <2 x double> %vecinsert2
}
-define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) {
; SSE2-LABEL: @buildvector_mul_subadd_ps256(
; SSE2-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
; SSE2-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
@@ -634,7 +634,7 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D,
ret <8 x float> %vecinsert8
}
-define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) {
; CHECK-LABEL: @buildvector_mul_subadd_pd256(
; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x double> [[A]], [[B:%.*]]
@@ -662,7 +662,7 @@ define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double>
ret <4 x double> %vecinsert4
}
-define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_subadd_ps512(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
@@ -756,7 +756,7 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
ret <16 x float> %vecinsert16
}
-define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) {
; SSE-LABEL: @buildvector_mul_subadd_ps512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
@@ -863,7 +863,7 @@ define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x
ret <16 x float> %vecinsert16
}
-define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_subadd_pd512(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP0:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
@@ -925,7 +925,7 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
ret <8 x double> %vecinsert8
}
-define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) {
; SSE-LABEL: @buildvector_mul_subadd_pd512_partial(
; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
@@ -1021,5 +1021,3 @@ define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x
%vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
ret <8 x double> %vecinsert8
}
-
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
index ae6f4a7..57637d6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-apple-macosx11.0.0"
; a[i] /= b;
; }
-define void @vdiv(ptr %a, float %b) #0 {
+define void @vdiv(ptr %a, float %b) {
; CHECK-LABEL: define void @vdiv(
; CHECK-SAME: ptr captures(none) [[A:%.*]], float [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
@@ -40,7 +40,7 @@ entry:
%i = alloca i32, align 4
store ptr %a, ptr %a.addr, align 8, !tbaa !3
store float %b, ptr %b.addr, align 4, !tbaa !7
- call void @llvm.lifetime.start.p0(ptr %i) #2
+ call void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !9
br label %for.cond
@@ -50,7 +50,7 @@ for.cond: ; preds = %for.inc, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call void @llvm.lifetime.end.p0(ptr %i) #2
+ call void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body: ; preds = %for.cond
@@ -74,12 +74,8 @@ for.end: ; preds = %for.cond.cleanup
ret void
}
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind ssp uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="true" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.start.p0(ptr nocapture)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index bcdf90c..bfb8554 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -211,7 +211,7 @@ for.end:
ret void
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
index dd5ff12..987a528 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -8,7 +8,7 @@
target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-define i32 @add_v4i32(ptr %p) #0 {
+define i32 @add_v4i32(ptr %p) {
; CHECK-LABEL: @add_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
@@ -46,7 +46,7 @@ for.end:
ret i32 %r.0
}
-define signext i16 @mul_v8i16(ptr %p) #0 {
+define signext i16 @mul_v8i16(ptr %p) {
; CHECK-LABEL: @mul_v8i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2, !tbaa [[TBAA4:![0-9]+]]
@@ -89,7 +89,7 @@ for.end:
ret i16 %r.0
}
-define signext i8 @or_v16i8(ptr %p) #0 {
+define signext i8 @or_v16i8(ptr %p) {
; CHECK-LABEL: @or_v16i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1, !tbaa [[TBAA6:![0-9]+]]
@@ -134,7 +134,7 @@ for.end:
ret i8 %r.0
}
-define i32 @smin_v4i32(ptr %p) #0 {
+define i32 @smin_v4i32(ptr %p) {
; CHECK-LABEL: @smin_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
@@ -185,7 +185,7 @@ for.end:
ret i32 %r.0
}
-define i32 @umax_v4i32(ptr %p) #0 {
+define i32 @umax_v4i32(ptr %p) {
; CHECK-LABEL: @umax_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
@@ -236,7 +236,7 @@ for.end:
ret i32 %r.0
}
-define float @fadd_v4i32(ptr %p) #0 {
+define float @fadd_v4i32(ptr %p) {
; CHECK-LABEL: @fadd_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7:![0-9]+]]
@@ -275,7 +275,7 @@ for.end:
ret float %r.0
}
-define float @fmul_v4i32(ptr %p) #0 {
+define float @fmul_v4i32(ptr %p) {
; CHECK-LABEL: @fmul_v4i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7]]
@@ -315,7 +315,7 @@ for.end:
ret float %r.0
}
-define float @fmin_v4f32(ptr %p) #0 {
+define float @fmin_v4f32(ptr %p) {
; CHECK-LABEL: @fmin_v4f32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4, !tbaa [[TBAA7]]
@@ -440,8 +440,6 @@ entry:
ret float %call13
}
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+avx,+cx16,+cx8,+fxsr,+mmx,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="true" "use-soft-float"="false" }
-
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git a9fe69c359de653015c39e413e48630d069abe27)"}
diff --git a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
index 63279b0..92ea2c6 100644
--- a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
@@ -35,9 +35,8 @@ define void @ham() #1 {
; CHECK-NEXT: [[SNORK_EXIT:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 48 to ptr), align 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i64 0)
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP1]], <vscale x 16 x float> [[TMP2]], <vscale x 16 x float> undef
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[SPEC_SELECT]], i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP2]]
; CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP3]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll
index a3cf23b..f793814 100644
--- a/llvm/test/Transforms/SCCP/conditions-ranges.ll
+++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll
@@ -1547,3 +1547,28 @@ bb2:
call void @use(i1 %c4)
ret void
}
+
+define i1 @and_predicate_dominating_phi(i32 %x) {
+; CHECK-LABEL: @and_predicate_dominating_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ %ret = icmp uge i32 %res, 1
+ ret i1 %ret
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
index 3a56258..0bd9a99 100644
--- a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
@@ -5,9 +5,9 @@ target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx--nvidiacl"
; Vector versions of the intrinsics are scalarized, so keep them scalar
-define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+define <2 x i8> @cltz_test(<2 x i8> %x) {
; CHECK-LABEL: define <2 x i8> @cltz_test(
-; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
@@ -27,10 +27,9 @@ entry:
ret <2 x i8> %vecinit2
}
-
-define <2 x i8> @cltz_test_poison(<2 x i8> %x) #0 {
+define <2 x i8> @cltz_test_poison(<2 x i8> %x) {
; CHECK-LABEL: define <2 x i8> @cltz_test_poison(
-; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
@@ -50,7 +49,5 @@ entry:
ret <2 x i8> %vecinit2
}
-declare i8 @llvm.ctlz.i8(i8, i1) #3
-
-attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare i8 @llvm.ctlz.i8(i8, i1)
+ "unsafe-fp-math"="false" \ No newline at end of file
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
index 29589f3..def73d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0"
%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113 = type { [4 x float] }
; Function Attrs: ssp uwtable
-define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) #0 align 2 {
+define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) align 2 {
; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[RETURN:%.*]], label [[IF_END:%.*]]
@@ -128,5 +128,3 @@ if.then17.2: ; preds = %if.end22.1
if.end22.2: ; preds = %if.then17.2, %if.end22.1
br i1 %arg, label %for.end36, label %for.body
}
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
index fc1bd85..02e18d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-macosx10.8.0"
; Function Attrs: nounwind ssp uwtable
-define void @main(i1 %arg) #0 {
+define void @main(i1 %arg) {
; CHECK-LABEL: @main(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 %arg, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
@@ -73,5 +73,3 @@ for.body267: ; preds = %for.body267, %for.b
for.end300: ; preds = %for.body267, %for.end80
unreachable
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
index f98a569..686befe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-apple-macosx10.7.0"
; A[8] = y0; A[8+1] = y1;
; }
-define i32 @depth(ptr nocapture %A, i32 %m) #0 !dbg !4 {
+define i32 @depth(ptr nocapture %A, i32 %m) !dbg !4 {
; CHECK-LABEL: @depth(
; CHECK-NEXT: entry:
; CHECK-NEXT: #dbg_value(ptr [[A:%.*]], [[META12:![0-9]+]], !DIExpression(), [[META18:![0-9]+]])
@@ -60,10 +60,7 @@ for.end: ; preds = %for.body.lr.ph, %en
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !32}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
index 1b76ee9..4d18bf8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -6,7 +6,7 @@ target triple = "i386--netbsd"
@a = common global ptr null, align 4
; Function Attrs: noreturn nounwind readonly
-define i32 @fn1() #0 {
+define i32 @fn1() {
; CHECK-LABEL: define i32 @fn1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
@@ -37,8 +37,6 @@ do.body: ; preds = %do.body, %entry
br label %do.body
}
-attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
!2 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
index 9e8cdc6..538751f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu"
; The GEP has scalar and vector parameters and returns vector of pointers.
; Function Attrs: noreturn readonly uwtable
-define void @_Z3fn1v(i32 %x, <16 x ptr>%y) local_unnamed_addr #0 {
+define void @_Z3fn1v(i32 %x, <16 x ptr>%y) local_unnamed_addr {
; CHECK-LABEL: @_Z3fn1v(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV42_LE:%.*]] = sext i32 [[X:%.*]] to i64
@@ -25,6 +25,3 @@ entry:
%VectorGep208 = getelementptr i32, <16 x ptr> %y, i64 %conv42.le
unreachable
}
-
-attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
index 79bef36..8077595 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
; the method implementation and it is not guaranteed that the best option
; is encountered first (like here).
-define double @root_selection(double %a, double %b, double %c, double %d) local_unnamed_addr #0 {
+define double @root_selection(double %a, double %b, double %c, double %d) local_unnamed_addr {
; CHECK-LABEL: @root_selection(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A:%.*]], i32 1
@@ -53,5 +53,3 @@ define double @root_selection(double %a, double %b, double %c, double %d) local_
%i18 = fadd fast double %i17, %d
ret double %i18
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
index 369ca28..bdc09ed 100644
--- a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
@@ -8,7 +8,7 @@
@D = common global [2000 x float] zeroinitializer, align 16
; Function Attrs: nounwind ssp uwtable
-define void @foo_3double(i32 %u) #0 {
+define void @foo_3double(i32 %u) {
; CHECK-LABEL: @foo_3double(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -65,7 +65,7 @@ entry:
; A[C1 + C2*i] are consecutive, if C2 is a power of 2, and C2 > C1 > 0.
; Thus, the following code should be vectorized.
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double(i32 %u) #0 {
+define void @foo_2double(i32 %u) {
; CHECK-LABEL: @foo_2double(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -104,7 +104,7 @@ entry:
; Similar to the previous test, but with different datatype.
; Function Attrs: nounwind ssp uwtable
-define void @foo_4float(i32 %u) #0 {
+define void @foo_4float(i32 %u) {
; CHECK-LABEL: @foo_4float(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -159,7 +159,7 @@ entry:
; Similar to the previous tests, but now we are dealing with AddRec SCEV.
; Function Attrs: nounwind ssp uwtable
-define i32 @foo_loop(ptr %A, i32 %n) #0 {
+define i32 @foo_loop(ptr %A, i32 %n) {
; CHECK-LABEL: @foo_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
@@ -248,7 +248,7 @@ for.end: ; preds = %for.cond.for.end_cr
; Similar to foo_2double but with a non-power-of-2 factor and potential
; wrapping (both indices wrap or both don't at the same time)
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double_non_power_of_2(i32 %u) #0 {
+define void @foo_2double_non_power_of_2(i32 %u) {
; CHECK-LABEL: @foo_2double_non_power_of_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -289,7 +289,7 @@ entry:
; Similar to foo_2double_non_power_of_2 but with zext's instead of sext's
; Function Attrs: nounwind ssp uwtable
-define void @foo_2double_non_power_of_2_zext(i32 %u) #0 {
+define void @foo_2double_non_power_of_2_zext(i32 %u) {
; CHECK-LABEL: @foo_2double_non_power_of_2_zext(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
@@ -332,7 +332,7 @@ entry:
; Alternatively, this is like foo_loop, but with a non-power-of-2 factor and
; potential wrapping (both indices wrap or both don't at the same time)
; Function Attrs: nounwind ssp uwtable
-define i32 @foo_loop_non_power_of_2(ptr %A, i32 %n) #0 {
+define i32 @foo_loop_non_power_of_2(ptr %A, i32 %n) {
; CHECK-LABEL: @foo_loop_non_power_of_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
@@ -438,7 +438,7 @@ for.end: ; preds = %for.cond.for.end_cr
;
; Make sure we are able to vectorize this from now on:
;
-define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
+define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr {
; CHECK-X86-LABEL: @bar(
; CHECK-X86-NEXT: entry:
; CHECK-X86-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
@@ -548,8 +548,6 @@ define void @store_constant_expression(ptr %p) {
ret void
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 3.5.0 "}
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
index 74e52da..385df87 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
@@ -6,7 +6,7 @@
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOTHRESHOLD %}
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=-10000 -slp-min-tree-size=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,MINTREESIZE %}
-define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -42,7 +42,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
declare void @llvm.assume(i1) nounwind
; This entire tree is ephemeral, don't vectorize any of it.
-define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; THRESHOLD-LABEL: @simple_select_eph(
; THRESHOLD-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; THRESHOLD-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -199,7 +199,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; Insert in an order different from the vector indices to make sure it
; doesn't matter
-define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -233,15 +233,15 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
ret <4 x float> %rd
}
-declare void @v4f32_user(<4 x float>) #0
-declare void @f32_user(float) #0
+declare void @v4f32_user(<4 x float>)
+declare void @f32_user(float)
; Multiple users of the final constructed vector
-define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]])
; CHECK-NEXT: ret <4 x float> [[TMP2]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -268,12 +268,12 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
%rb = insertelement <4 x float> %ra, float %s1, i32 1
%rc = insertelement <4 x float> %rb, float %s2, i32 2
%rd = insertelement <4 x float> %rc, float %s3, i32 3
- call void @v4f32_user(<4 x float> %rd) #0
+ call void @v4f32_user(<4 x float> %rd)
ret <4 x float> %rd
}
; Unused insertelement
-define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
@@ -319,7 +319,7 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; Make sure infinite loop doesn't happen which I ran into when trying
; to do this backwards
-define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+define <4 x i32> @reconstruct(<4 x i32> %c) {
; CHECK-LABEL: @reconstruct(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -342,7 +342,7 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
ret <4 x i32> %rd
}
-define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) {
; CHECK-LABEL: @simple_select_v2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
@@ -366,7 +366,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with undef
; (low cost threshold needed to force this to happen)
-define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_partial_vector(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -487,7 +487,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
ret <4 x double> %i4
}
-define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x float> [[TMP1]]
@@ -526,5 +526,3 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
%vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
ret <8 x float> %vecinit7.i
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
index 0b896f4..37c02d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
@@ -6,7 +6,7 @@
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOTHRESHOLD %}
; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -slp-threshold=-10000 -slp-min-tree-size=0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,MINTREESIZE %}
-define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -39,7 +39,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
ret <4 x float> %rd
}
-define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -77,7 +77,7 @@ define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
declare void @llvm.assume(i1) nounwind
; This entire tree is ephemeral, don't vectorize any of it.
-define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; THRESHOLD-LABEL: @simple_select_eph(
; THRESHOLD-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; THRESHOLD-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -234,7 +234,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; Insert in an order different from the vector indices to make sure it
; doesn't matter
-define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
@@ -268,15 +268,15 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
ret <4 x float> %rd
}
-declare void @v4f32_user(<4 x float>) #0
-declare void @f32_user(float) #0
+declare void @v4f32_user(<4 x float>)
+declare void @f32_user(float)
; Multiple users of the final constructed vector
-define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]])
; CHECK-NEXT: ret <4 x float> [[TMP2]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -303,12 +303,12 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
%rb = insertelement <4 x float> %ra, float %s1, i32 1
%rc = insertelement <4 x float> %rb, float %s2, i32 2
%rd = insertelement <4 x float> %rc, float %s3, i32 3
- call void @v4f32_user(<4 x float> %rd) #0
+ call void @v4f32_user(<4 x float> %rd)
ret <4 x float> %rd
}
; Unused insertelement
-define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
@@ -355,7 +355,7 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; Make sure infinite loop doesn't happen which I ran into when trying
; to do this backwards
-define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+define <4 x i32> @reconstruct(<4 x i32> %c) {
; CHECK-LABEL: @reconstruct(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -378,7 +378,7 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
ret <4 x i32> %rd
}
-define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) {
; CHECK-LABEL: @simple_select_v2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
@@ -402,7 +402,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with zeroinitializer
; (low cost threshold needed to force this to happen)
-define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: @simple_select_partial_vector(
; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
@@ -523,7 +523,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
ret <4 x double> %i4
}
-define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x float> [[TMP1]]
@@ -562,5 +562,3 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
%vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
ret <8 x float> %vecinit7.i
}
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll b/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
index bc05971..82ebdaa 100644
--- a/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
+++ b/llvm/test/Transforms/SROA/mem-par-metadata-sroa.ll
@@ -33,7 +33,6 @@
; }
; }
-
; ModuleID = '<stdin>'
source_filename = "mem-par-metadata-sroa1.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -42,7 +41,7 @@ target triple = "x86_64-unknown-linux-gnu"
%class.Complex = type { float, float }
; Function Attrs: norecurse nounwind uwtable
-define void @_Z4testP7Complexl(ptr nocapture %out, i64 %size) local_unnamed_addr #0 {
+define void @_Z4testP7Complexl(ptr nocapture %out, i64 %size) local_unnamed_addr {
; CHECK-LABEL: @_Z4testP7Complexl(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND:%.*]]
@@ -108,10 +107,7 @@ for.end: ; preds = %for.cond
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
-
-attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 43fb260..d981626 100644
--- a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,7 +1,5 @@
; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
; RUN: opt -passes=safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
-; RUN: opt -passes=safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
define void @foo() nounwind uwtable safestack sspreq {
entry:
@@ -10,7 +8,6 @@ entry:
; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer.p0()
; ANDROID: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 40
-; FUCHSIA: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 -16
; TLS: %[[StackGuard:.*]] = load ptr, ptr %[[B]]
; TLS: store ptr %[[StackGuard]], ptr %[[StackGuardSlot:.*]]
%a = alloca i128, align 16
diff --git a/llvm/test/Transforms/SafeStack/ARM/debug.ll b/llvm/test/Transforms/SafeStack/ARM/debug.ll
index 207475a..dfce13a 100644
--- a/llvm/test/Transforms/SafeStack/ARM/debug.ll
+++ b/llvm/test/Transforms/SafeStack/ARM/debug.ll
@@ -42,16 +42,15 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #3
-declare void @Capture(ptr) local_unnamed_addr #4
+declare void @Capture(ptr) local_unnamed_addr
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-attributes #0 = { norecurse nounwind readonly safestack "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind safestack "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readonly safestack }
+attributes #1 = { nounwind safestack }
attributes #2 = { argmemonly nounwind }
attributes #3 = { nounwind readnone speculatable }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv7-a,+dsp,+neon,+vfp3,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
!llvm.dbg.cu = !{!2}
diff --git a/llvm/test/Transforms/SafeStack/X86/debug-loc.ll b/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
index fcb4935..92197a7 100644
--- a/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
+++ b/llvm/test/Transforms/SafeStack/X86/debug-loc.ll
@@ -44,11 +44,10 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-declare void @Capture(ptr) #2
+declare void @Capture(ptr)
-attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { safestack uwtable }
attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!15, !16}
diff --git a/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll b/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
index e60522f..634231d 100644
--- a/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
+++ b/llvm/test/Transforms/SafeStack/X86/debug-loc2.ll
@@ -45,7 +45,7 @@ entry:
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0(ptr nocapture) #1
-declare void @capture(ptr) #2
+declare void @capture(ptr)
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
@@ -55,9 +55,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #3
declare void @llvm.random.metadata.use(metadata)
-attributes #0 = { noinline safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline safestack uwtable }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind }
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll b/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
index b8e64e8..a48d46d 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-symbol-list.ll
@@ -90,10 +90,10 @@ while.end: ; preds = %while.body
; Function Attrs: nofree nounwind
declare dso_local i32 @printf(ptr nocapture readonly, ...) local_unnamed_addr #3
-attributes #0 = { noinline norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #2 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #3 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { noinline norecurse nounwind readnone uwtable "use-sample-profile" }
+attributes #1 = { norecurse nounwind readnone uwtable "use-sample-profile" }
+attributes #2 = { nofree norecurse nounwind uwtable "use-sample-profile" }
+attributes #3 = { nofree nounwind "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/branch.ll b/llvm/test/Transforms/SampleProfile/branch.ll
index ff5d8bb..83f9354 100644
--- a/llvm/test/Transforms/SampleProfile/branch.ll
+++ b/llvm/test/Transforms/SampleProfile/branch.ll
@@ -62,7 +62,7 @@ if.end: ; preds = %entry
%1 = load ptr, ptr %argv.addr, align 8, !dbg !30
%arrayidx = getelementptr inbounds ptr, ptr %1, i64 1, !dbg !30
%2 = load ptr, ptr %arrayidx, align 8, !dbg !30
- %call = call i32 @atoi(ptr %2) #4, !dbg !31
+ %call = call i32 @atoi(ptr %2), !dbg !31
store i32 %call, ptr %limit, align 4, !dbg !29
%3 = load i32, ptr %limit, align 4, !dbg !32
%cmp1 = icmp sgt i32 %3, 100, !dbg !34
@@ -75,7 +75,7 @@ if.then.2: ; preds = %if.end
%4 = load ptr, ptr %argv.addr, align 8, !dbg !39
%arrayidx3 = getelementptr inbounds ptr, ptr %4, i64 2, !dbg !39
%5 = load ptr, ptr %arrayidx3, align 8, !dbg !39
- %call4 = call i32 @atoi(ptr %5) #4, !dbg !40
+ %call4 = call i32 @atoi(ptr %5), !dbg !40
%conv = sitofp i32 %call4 to double, !dbg !40
%mul = fmul double 0x40370ABE6A337A81, %conv, !dbg !41
store double %mul, ptr %s, align 8, !dbg !38
@@ -128,7 +128,7 @@ if.else: ; preds = %if.end
%16 = load ptr, ptr %argv.addr, align 8, !dbg !72
%arrayidx10 = getelementptr inbounds ptr, ptr %16, i64 2, !dbg !72
%17 = load ptr, ptr %arrayidx10, align 8, !dbg !72
- %call11 = call i32 @atoi(ptr %17) #4, !dbg !74
+ %call11 = call i32 @atoi(ptr %17), !dbg !74
%conv12 = sitofp i32 %call11 to double, !dbg !74
store double %conv12, ptr %result, align 8, !dbg !75
br label %if.end.13
@@ -145,18 +145,14 @@ return: ; preds = %if.end.13, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readonly
-declare i32 @atoi(ptr) #2
+declare i32 @atoi(ptr)
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readonly }
+attributes #0 = { uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
index 077eab7..ade2d73 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -56,7 +56,7 @@ for.body: ; preds = %for.body, %entry
; THRESHOLD-REPLAY: !{!"function_entry_count", i64 1, i64 446061515086924981, i64 3815895320998406042, i64 6309742469962978389, i64 7102633082150537521, i64 -2862076748587597320, i64 -2016976694713209516}
; THRESHOLD-REPLAY-NO-FUNCA: !{!"function_entry_count", i64 1, i64 446061515086924981, i64 3815895320998406042, i64 6309742469962978389, i64 7102633082150537521, i64 -2862076748587597320}
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
index fe31023..de51afd 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
@@ -88,8 +88,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
index 177329f..deabc27 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
@@ -109,8 +109,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-summary.ll b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
index f18425e..3daa69e 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
@@ -75,8 +75,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
index 030b5aa..e4ee939 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
@@ -87,8 +87,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll b/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
index c7617c1..4aacde8 100644
--- a/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
+++ b/llvm/test/Transforms/SampleProfile/entry_counts_cold.ll
@@ -91,12 +91,11 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-declare void @baz(...) #3
+declare void @baz(...)
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind ssp uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll b/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
index 0e62921..e0cd883 100644
--- a/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
+++ b/llvm/test/Transforms/SampleProfile/entry_counts_missing_dbginfo.ll
@@ -101,12 +101,11 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0(ptr nocapture) #2
-declare void @baz(...) #3
+declare void @baz(...)
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind ssp uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/fsafdo_test.ll b/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
index 4a35cb1..8cc5d06 100644
--- a/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
+++ b/llvm/test/Transforms/SampleProfile/fsafdo_test.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
@sum = dso_local local_unnamed_addr global i32 0, align 4
declare i32 @bar(i32 %i) #0
-declare void @work(i32 %i) #2
+declare void @work(i32 %i)
define dso_local void @foo() #0 !dbg !29 {
; CHECK: Printing analysis {{.*}} for function 'foo':
@@ -141,7 +141,7 @@ if.end9.3:
; CHECK: edge %if.end9.3 -> %for.cond1.preheader probability is 0x7f7cb227 / 0x80000000 = 99.60% [HOT edge]
}
-define dso_local i32 @main() #3 !dbg !52 {
+define dso_local i32 @main() !dbg !52 {
entry:
br label %for.body, !dbg !53
@@ -157,10 +157,8 @@ for.end:
}
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile"}
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/gcc-simple.ll b/llvm/test/Transforms/SampleProfile/gcc-simple.ll
index 315ac5a..2496d18 100644
--- a/llvm/test/Transforms/SampleProfile/gcc-simple.ll
+++ b/llvm/test/Transforms/SampleProfile/gcc-simple.ll
@@ -31,7 +31,7 @@ entry:
%i.addr = alloca i64, align 8
store i64 %i, ptr %i.addr, align 8
call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !16, metadata !17), !dbg !18
- %call = call i32 @rand() #3, !dbg !19
+ %call = call i32 @rand(), !dbg !19
; CHECK: !prof ![[PROF1:[0-9]+]]
%cmp = icmp slt i32 %call, 500, !dbg !21
br i1 %cmp, label %if.then, label %if.else, !dbg !22
@@ -42,7 +42,7 @@ if.then: ; preds = %entry
br label %return, !dbg !23
if.else: ; preds = %entry
- %call1 = call i32 @rand() #3, !dbg !25
+ %call1 = call i32 @rand(), !dbg !25
; CHECK: !prof ![[PROF3:[0-9]+]]
%cmp2 = icmp sgt i32 %call1, 5000, !dbg !28
br i1 %cmp2, label %if.then.3, label %if.else.4, !dbg !29
@@ -62,10 +62,10 @@ return: ; preds = %if.else.4, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind
-declare i32 @rand() #2
+declare i32 @rand()
; Function Attrs: nounwind uwtable
define i32 @main() #0 !dbg !9 {
@@ -141,10 +141,7 @@ for.end.6: ; preds = %for.cond
; CHECK: ![[PROF7]] = !{!"branch_weights", i32 18943, i32 1}
; CHECK: ![[PROF8]] = !{!"branch_weights", i32 18942}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/inline-act.ll b/llvm/test/Transforms/SampleProfile/inline-act.ll
index 3ab3efd..e962642 100644
--- a/llvm/test/Transforms/SampleProfile/inline-act.ll
+++ b/llvm/test/Transforms/SampleProfile/inline-act.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-unknown-linux-gnu"
@t = global i32 0, align 4
; Function Attrs: nounwind uwtable
-define zeroext i1 @_Z3fooi(i32) #0 {
+define zeroext i1 @_Z3fooi(i32) {
%switch.tableidx = sub i32 %0, 0
%2 = icmp ult i32 %switch.tableidx, 4
br i1 %2, label %switch.lookup, label %3
@@ -41,7 +41,7 @@ switch.lookup: ; preds = %1
}
; Function Attrs: nounwind uwtable
-define void @_Z3bari(i32) #0 !dbg !9 {
+define void @_Z3bari(i32) !dbg !9 {
%2 = call zeroext i1 @_Z3fooi(i32 %0), !dbg !10
br i1 %2, label %3, label %6, !dbg !10
@@ -55,8 +55,6 @@ define void @_Z3bari(i32) #0 !dbg !9 {
ret void
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}
!llvm.ident = !{!4}
diff --git a/llvm/test/Transforms/SampleProfile/misexpect.ll b/llvm/test/Transforms/SampleProfile/misexpect.ll
index 26f76ee5..a7667fc 100644
--- a/llvm/test/Transforms/SampleProfile/misexpect.ll
+++ b/llvm/test/Transforms/SampleProfile/misexpect.ll
@@ -68,7 +68,7 @@ if.end: ; preds = %entry
%1 = load ptr, ptr %argv.addr, align 8, !dbg !30
%arrayidx = getelementptr inbounds ptr, ptr %1, i64 1, !dbg !30
%2 = load ptr, ptr %arrayidx, align 8, !dbg !30
- %call = call i32 @atoi(ptr %2) #4, !dbg !31
+ %call = call i32 @atoi(ptr %2), !dbg !31
store i32 %call, ptr %limit, align 4, !dbg !29
%3 = load i32, ptr %limit, align 4, !dbg !32
%exp = call i32 @llvm.expect.i32(i32 %3, i32 0)
@@ -80,7 +80,7 @@ if.then.2: ; preds = %if.end
%4 = load ptr, ptr %argv.addr, align 8, !dbg !39
%arrayidx3 = getelementptr inbounds ptr, ptr %4, i64 2, !dbg !39
%5 = load ptr, ptr %arrayidx3, align 8, !dbg !39
- %call4 = call i32 @atoi(ptr %5) #4, !dbg !40
+ %call4 = call i32 @atoi(ptr %5), !dbg !40
%conv = sitofp i32 %call4 to double, !dbg !40
%mul = fmul double 0x40370ABE6A337A81, %conv, !dbg !41
store double %mul, ptr %s, align 8, !dbg !38
@@ -130,7 +130,7 @@ if.else: ; preds = %if.end
%16 = load ptr, ptr %argv.addr, align 8, !dbg !72
%arrayidx10 = getelementptr inbounds ptr, ptr %16, i64 2, !dbg !72
%17 = load ptr, ptr %arrayidx10, align 8, !dbg !72
- %call11 = call i32 @atoi(ptr %17) #4, !dbg !74
+ %call11 = call i32 @atoi(ptr %17), !dbg !74
%conv12 = sitofp i32 %call11 to double, !dbg !74
store double %conv12, ptr %result, align 8, !dbg !75
br label %if.end.13
@@ -147,23 +147,17 @@ return: ; preds = %if.end.13, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: nounwind readonly
-declare i32 @atoi(ptr) #2
+declare i32 @atoi(ptr)
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
; Function Attrs: nounwind readnone willreturn
-declare i32 @llvm.expect.i32(i32, i32) #5
+declare i32 @llvm.expect.i32(i32, i32)
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readonly }
-attributes #5 = { nounwind readnone willreturn }
+attributes #0 = { uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!13, !14}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
index 30a5198..5ce8670 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll
@@ -17,7 +17,7 @@ entry:
}
; Function Attrs: nofree nounwind
-declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) #1
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...)
; Function Attrs: uwtable mustprogress
define dso_local void @_Z3hoov() #0 !dbg !11 {
@@ -35,7 +35,7 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !22
}
-declare !dbg !23 dso_local void @_Z10hoo_calleev() #2
+declare !dbg !23 dso_local void @_Z10hoo_calleev()
; MAX2-LABEL: @_Z3goov(
; MAX2: icmp eq ptr {{.*}} @_Z3hoov
@@ -78,12 +78,9 @@ entry:
; MAX4: ![[PROF_ID5]] = !{!"VP", i32 0, i64 13000, i64 4128940972712279918, i64 -1, i64 3137940972712279918, i64 -1, i64 2132940972712279918, i64 -1, i64 1850239051784516332, i64 -1}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #3
+declare noundef i32 @puts(ptr nocapture noundef readonly)
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
index 9b9bbca4..c5645cd 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
@@ -34,11 +34,10 @@ entry:
}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #2
+declare noundef i32 @puts(ptr nocapture noundef readonly) #1
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
+attributes #1 = { nofree nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
index 57a2386..c418372 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-4.ll
@@ -33,7 +33,7 @@ entry:
ret void, !dbg !21
}
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !25}
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
index f3340ba..bf3cfe4 100644
--- a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll
@@ -15,7 +15,7 @@ entry:
}
; Function Attrs: nofree nounwind
-declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) #1
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...)
; Function Attrs: uwtable mustprogress
define dso_local void @_Z3goov() #0 !dbg !11 {
@@ -40,11 +40,9 @@ entry:
}
; Function Attrs: nofree nounwind
-declare noundef i32 @puts(ptr nocapture noundef readonly) #2
+declare noundef i32 @puts(ptr nocapture noundef readonly)
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nofree nounwind }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/SampleProfile/offset.ll b/llvm/test/Transforms/SampleProfile/offset.ll
index 0fdeb08..0f0187a 100644
--- a/llvm/test/Transforms/SampleProfile/offset.ll
+++ b/llvm/test/Transforms/SampleProfile/offset.ll
@@ -47,7 +47,7 @@ return: ; preds = %if.else, %if.then
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
index db368bc..eddbb0b 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
@@ -118,8 +118,8 @@ entry:
declare i32 @_Z3foo(i32)
-attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
index bb0abb1..aad4e38 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
@@ -155,8 +155,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
index 9d2ac2f..a1bf359 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -152,8 +152,8 @@ entry:
declare i32 @_Z3fibi(i32)
-attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline norecurse nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
index f85ab24..3cb8cd1 100644
--- a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
@@ -99,8 +99,8 @@ entry:
declare i32 @_Z3foo(i32)
-attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nofree noinline nounwind uwtable "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
diff --git a/llvm/test/Transforms/SampleProfile/propagate.ll b/llvm/test/Transforms/SampleProfile/propagate.ll
index ece85a8..3a07152 100644
--- a/llvm/test/Transforms/SampleProfile/propagate.ll
+++ b/llvm/test/Transforms/SampleProfile/propagate.ll
@@ -123,7 +123,6 @@ for.cond8: ; preds = %for.inc, %if.else7
; CHECK: edge %for.cond8 -> %for.body10 probability is 0x7e941a89 / 0x80000000 = 98.89% [HOT edge]
; CHECK: edge %for.cond8 -> %for.end probability is 0x016be577 / 0x80000000 = 1.11%
-
for.body10: ; preds = %for.cond8
%14 = load i64, ptr %j, align 8, !dbg !69
%15 = load i32, ptr %x.addr, align 4, !dbg !71
@@ -171,7 +170,7 @@ return: ; preds = %if.end20, %if.then
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
; Function Attrs: norecurse uwtable
define i32 @main() #2 !dbg !86 {
@@ -198,12 +197,10 @@ entry:
ret i32 0, !dbg !104
}
-declare i32 @printf(ptr, ...) #3
+declare i32 @printf(ptr, ...)
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
+attributes #2 = { norecurse uwtable "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
index 26ae198..7d6fd1b 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
@@ -2,11 +2,10 @@
; RUN: opt < %s -passes='default<O2>' -debug-info-for-profiling -pseudo-probe-for-profiling -S | FileCheck %s --check-prefix=PROBE
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -debug-info-for-profiling -pseudo-probe-for-profiling -S | FileCheck %s --check-prefix=PROBE
-
@a = dso_local global i32 0, align 4
; Function Attrs: uwtable
-define void @_Z3foov(i32 %x) #0 !dbg !4 {
+define void @_Z3foov(i32 %x) !dbg !4 {
bb0:
%cmp = icmp eq i32 %x, 0, !dbg !10
br i1 %cmp, label %bb1, label %bb2
@@ -30,13 +29,10 @@ bb3:
ret void, !dbg !12
}
-declare void @_Z3barv() #1
+declare void @_Z3barv()
declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
@@ -62,7 +58,6 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "fra
; DEBUG: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; DEBUG: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
-
; PROBE: ![[CALL1]] = !DILocation(line: 4, column: 3, scope: ![[CALL1BLOCK:[0-9]+]])
; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 455147551)
; PROBE: ![[CALL2]] = !DILocation(line: 4, column: 9, scope: ![[CALL2BLOCK:[0-9]+]])
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
index 383289e..92f04d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
@@ -135,7 +135,7 @@ declare dso_local i32 @printf(ptr, ...)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3
-attributes #0 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
index e45ddb1..08b7e4c 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-no-debug-info.ll
@@ -1,6 +1,5 @@
; RUN: opt < %s -passes='pseudo-probe,cgscc(inline)' -S | FileCheck %s
-
; CHECK-LABEL: @caller(
; This instruction did not have !dbg metadata in the callee but gets a !dbg after inlining.
@@ -10,26 +9,23 @@
; CHECK-NOT: call void @llvm.pseudoprobe({{.*}}), !dbg ![[#]]
; CHECK: call void @llvm.pseudoprobe({{.*}})
-
@a = common global i32 0, align 4
@b = common global i32 0, align 4
; Function Attrs: nounwind uwtable
-define void @callee() #0 {
+define void @callee() {
entry:
store i32 1, ptr @a, align 4
ret void
}
; Function Attrs: nounwind uwtable
-define void @caller() #0 !dbg !4 {
+define void @caller() !dbg !4 {
entry:
tail call void @callee(), !dbg !12
ret void, !dbg !12
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 3cb91b7..decf4b1 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -202,10 +202,10 @@ entry:
ret i32 %conv, !dbg !58
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #0 = { nounwind uwtable "use-sample-profile" }
attributes #1 = { nounwind argmemonly }
attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind alwaysinline "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind alwaysinline }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/SampleProfile/uniqname.ll b/llvm/test/Transforms/SampleProfile/uniqname.ll
index 23c3ac9..67988c7 100644
--- a/llvm/test/Transforms/SampleProfile/uniqname.ll
+++ b/llvm/test/Transforms/SampleProfile/uniqname.ll
@@ -65,9 +65,9 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !31
}
-declare !dbg !32 dso_local void @_Z10hoo_calleev() #3
+declare !dbg !32 dso_local void @_Z10hoo_calleev()
-declare !dbg !33 dso_local void @_Z10moo_calleev() #3
+declare !dbg !33 dso_local void @_Z10moo_calleev()
; Function Attrs: uwtable mustprogress
define internal void @_ZL3noov.__uniq.334154460836426447066042049082945760258() #1 !dbg !34 {
@@ -84,12 +84,11 @@ if.end: ; preds = %if.then, %entry
ret void, !dbg !37
}
-declare !dbg !38 dso_local void @_Z10noo_calleev() #3
+declare !dbg !38 dso_local void @_Z10noo_calleev()
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #1 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "sample-profile-suffix-elision-policy"="selected" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #2 = { noinline uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "sample-profile-suffix-elision-policy"="selected" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
-attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable mustprogress "use-sample-profile" }
+attributes #1 = { uwtable mustprogress "use-sample-profile" "sample-profile-suffix-elision-policy"="selected" }
+attributes #2 = { noinline uwtable mustprogress "use-sample-profile" "sample-profile-suffix-elision-policy"="selected" }
; CHECK: ![[PROF_ID1]] = !{!"branch_weights", i32 5931}
; CHECK: ![[PROF_ID2]] = !{!"branch_weights", i32 2000}
diff --git a/llvm/test/Transforms/Scalarizer/dbginfo.ll b/llvm/test/Transforms/Scalarizer/dbginfo.ll
index 464e752..d6b8aa2 100644
--- a/llvm/test/Transforms/Scalarizer/dbginfo.ll
+++ b/llvm/test/Transforms/Scalarizer/dbginfo.ll
@@ -2,7 +2,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
-define void @f1(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c) #0 !dbg !4 {
+define void @f1(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c) !dbg !4 {
; CHECK: @f1(
; CHECK: %a.i1 = getelementptr i32, ptr %a, i32 1
; CHECK: %a.i2 = getelementptr i32, ptr %a, i32 2
@@ -45,10 +45,7 @@ entry:
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!18, !26}
diff --git a/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
index ccc8def..7aa6ced 100644
--- a/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/Hexagon/switch-to-lookup-table.ll
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i
target triple = "hexagon-unknown--elf"
; Function Attrs: noinline nounwind
-define i32 @foo(i32 %x) #0 section ".tcm_text" {
+define i32 @foo(i32 %x) section ".tcm_text" {
; ENABLE-LABEL: @foo(
; ENABLE-NEXT: entry:
; ENABLE-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 6
@@ -92,5 +92,3 @@ return: ; preds = %sw.default, %sw.bb5
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
-
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll b/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
index 6e6c97f..2d21a59 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/merge-cleanuppads.ll
@@ -3,37 +3,33 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc18.0.0"
; Function Attrs: uwtable
-define void @test1() #0 personality ptr @__CxxFrameHandler3 {
+define void @test1() personality ptr @__CxxFrameHandler3 {
entry:
invoke void @may_throw(i32 3)
to label %invoke.cont unwind label %ehcleanup
invoke.cont: ; preds = %entry
- tail call void @may_throw(i32 2) #2
- tail call void @may_throw(i32 1) #2
+ tail call void @may_throw(i32 2)
+ tail call void @may_throw(i32 1)
ret void
ehcleanup: ; preds = %entry
%cp = cleanuppad within none []
- tail call void @may_throw(i32 2) #2 [ "funclet"(token %cp) ]
+ tail call void @may_throw(i32 2) [ "funclet"(token %cp) ]
cleanupret from %cp unwind label %ehcleanup2
ehcleanup2:
%cp2 = cleanuppad within none []
- tail call void @may_throw(i32 1) #2 [ "funclet"(token %cp2) ]
+ tail call void @may_throw(i32 1) [ "funclet"(token %cp2) ]
cleanupret from %cp2 unwind to caller
}
; CHECK-LABEL: define void @test1(
; CHECK: %[[cp:.*]] = cleanuppad within none []
-; CHECK: tail call void @may_throw(i32 2) #2 [ "funclet"(token %[[cp]]) ]
-; CHECK: tail call void @may_throw(i32 1) #2 [ "funclet"(token %[[cp]]) ]
+; CHECK: tail call void @may_throw(i32 2) [ "funclet"(token %[[cp]]) ]
+; CHECK: tail call void @may_throw(i32 1) [ "funclet"(token %[[cp]]) ]
; CHECK: cleanupret from %[[cp]] unwind to caller
-declare void @may_throw(i32) #1
+declare void @may_throw(i32)
declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll b/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
index 19e1c73..0363792 100644
--- a/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
+++ b/llvm/test/Transforms/SimplifyCFG/pr50060-constantfold-loopid.ll
@@ -13,7 +13,7 @@
@C = dso_local global i32 0, align 4
; Function Attrs: nounwind
-define dso_local void @_Z6test01v() addrspace(1) #0 {
+define dso_local void @_Z6test01v() addrspace(1) {
; CHECK-LABEL: @_Z6test01v(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
@@ -22,7 +22,7 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK: do.body:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @C, align 4, !tbaa [[TBAA2:![0-9]+]]
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[J]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[J]])
; CHECK-NEXT: store i32 0, ptr [[J]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
@@ -30,11 +30,11 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[J]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[J]])
; CHECK-NEXT: br label [[DO_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: store i32 undef, ptr [[I]], align 4
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.start.p0(ptr [[I]])
; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: br label [[FOR_COND1:%.*]]
; CHECK: for.cond1:
@@ -43,7 +43,7 @@ define dso_local void @_Z6test01v() addrspace(1) #0 {
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], [[TMP3]]
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4:%.*]], label [[FOR_COND_CLEANUP3:%.*]]
; CHECK: for.cond.cleanup3:
-; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR2]]
+; CHECK-NEXT: call addrspace(1) void @llvm.lifetime.end.p0(ptr [[I]])
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[J]], align 4, !tbaa [[TBAA2]]
; CHECK-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP4]], 1
; CHECK-NEXT: store i32 [[INC7]], ptr [[J]], align 4, !tbaa [[TBAA2]]
@@ -64,7 +64,7 @@ entry:
do.body: ; preds = %do.cond, %entry
%0 = load i32, ptr @C, align 4, !tbaa !2
%inc = add nsw i32 %0, 1
- call addrspace(1) void @llvm.lifetime.start.p0(ptr %j) #2
+ call addrspace(1) void @llvm.lifetime.start.p0(ptr %j)
store i32 0, ptr %j, align 4, !tbaa !2
br label %for.cond
@@ -74,12 +74,12 @@ for.cond: ; preds = %for.inc6, %do.body
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
- call addrspace(1) void @llvm.lifetime.end.p0(ptr %j) #2
+ call addrspace(1) void @llvm.lifetime.end.p0(ptr %j)
br label %for.end8
for.body: ; preds = %for.cond
store i32 undef, ptr %i, align 4
- call addrspace(1) void @llvm.lifetime.start.p0(ptr %i) #2
+ call addrspace(1) void @llvm.lifetime.start.p0(ptr %i)
store i32 0, ptr %i, align 4, !tbaa !2
br label %for.cond1
@@ -90,7 +90,7 @@ for.cond1: ; preds = %for.inc, %for.body
br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
for.cond.cleanup3: ; preds = %for.cond1
- call addrspace(1) void @llvm.lifetime.end.p0(ptr %i) #2
+ call addrspace(1) void @llvm.lifetime.end.p0(ptr %i)
br label %for.end
for.body4: ; preds = %for.cond1
@@ -124,14 +124,10 @@ do.end: ; preds = %do.cond
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(ptr nocapture) addrspace(1) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture) addrspace(1)
; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(ptr nocapture) addrspace(1) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture) addrspace(1)
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
index b3cbc3d..0d3846d 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll
@@ -7,7 +7,7 @@ target triple = "amdgcn--"
%struct.Matrix4x4 = type { [4 x [4 x float]] }
; Function Attrs: nounwind
-define fastcc void @Accelerator_Intersect(ptr addrspace(1) nocapture readonly %leafTransformations) #0 {
+define fastcc void @Accelerator_Intersect(ptr addrspace(1) nocapture readonly %leafTransformations) {
; CHECK-LABEL: @Accelerator_Intersect(
entry:
%tmp = sext i32 undef to i64
@@ -17,5 +17,3 @@ entry:
%tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4
ret void
}
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "target-cpu"="tahiti" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
index 91e9511..11753cf 100644
--- a/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/llvm/test/Transforms/StructurizeCFG/nested-loop-order.ll
@@ -63,7 +63,3 @@ ENDIF28: ; preds = %ENDIF
%tmp36 = icmp sgt i32 %tmp20, 2
br i1 %tmp36, label %ENDLOOP, label %LOOP.outer
}
-
-attributes #0 = { "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 2e96a92..cc1dc4e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) {
call void @foo(i1 %a15)
ret void
}
+
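+; Check that the copies of %x that PredicateInfo materializes for the branch
+; condition (printed as no-op bitcasts in the CHECK lines below) replace %x as
+; the phi operand arriving from the true edge.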
+define i32 @test_and_with_phinode(i32 %x) {
+; CHECK-LABEL: @test_and_with_phinode(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ ret i32 %res
+}
diff --git a/llvm/test/Transforms/Util/dbg-user-of-aext.ll b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
index 9e7935e..b3d1b90 100644
--- a/llvm/test/Transforms/Util/dbg-user-of-aext.ll
+++ b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
@@ -27,7 +27,7 @@
%struct.foo = type { i8, i64 }
; Function Attrs: noinline nounwind uwtable
-define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 {
+define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) !dbg !6 {
entry:
%g = alloca %struct.foo, align 8
%b.addr = alloca i8, align 1
@@ -51,10 +51,7 @@ entry:
; CHECK: ![[VAR_FRAG]] = !DILocalVariable(name: "frag"
; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
index ad23bf7..e9f0c8c 100644
--- a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
+++ b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
@@ -19,18 +19,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; Function Attrs: nounwind
-define float @fn(float %f) #0 {
+define float @fn(float %f) {
; CHECK: define float @fn(
; CHECK: call fast float @expf(
%f.addr = alloca float, align 4
store float %f, ptr %f.addr, align 4, !tbaa !1
%1 = load float, ptr %f.addr, align 4, !tbaa !1
- %call = call fast float @expf(float %1) #3
+ %call = call fast float @expf(float %1)
ret float %call
}
; Function Attrs: inlinehint nounwind readnone
-define available_externally float @expf(float %x) #1 {
+define available_externally float @expf(float %x) {
; CHECK: define available_externally float @expf(
; CHECK: fpext float
; CHECK: call fast double @exp(
@@ -39,17 +39,13 @@ define available_externally float @expf(float %x) #1 {
store float %x, ptr %x.addr, align 4, !tbaa !1
%1 = load float, ptr %x.addr, align 4, !tbaa !1
%conv = fpext float %1 to double
- %call = call fast double @exp(double %conv) #3
+ %call = call fast double @exp(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
}
; Function Attrs: nounwind readnone
-declare double @exp(double) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare double @exp(double)
!llvm.ident = !{!0}
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index f835b98..17bf5a0 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -1,14 +1,15 @@
; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
-; CHECK: atomic store operand must have integer, pointer, or floating point type!
-; CHECK: atomic load operand must have integer, pointer, or floating point type!
+; CHECK: atomic store operand must have integer, pointer, floating point, or vector type!
+; CHECK: atomic load operand must have integer, pointer, floating point, or vector type!
+%ty = type { i32 };
-define void @foo(ptr %P, <1 x i64> %v) {
- store atomic <1 x i64> %v, ptr %P unordered, align 8
+define void @foo(ptr %P, %ty %v) {
+ store atomic %ty %v, ptr %P unordered, align 8
ret void
}
-define <1 x i64> @bar(ptr %P) {
- %v = load atomic <1 x i64>, ptr %P unordered, align 8
- ret <1 x i64> %v
+define %ty @bar(ptr %P) {
+ %v = load atomic %ty, ptr %P unordered, align 8
+ ret %ty %v
}
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
index b2362f8..ade228d 100644
--- a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
@@ -49,7 +49,7 @@ entry:
; CHECK-FUNC-LEVEL-ABC: Function: abc
; CHECK-FUNC-LEVEL-NEXT-ABC: [ 3630.00 3672.00 3714.00 ]
-; CHECK-FUNC-DEF: Error: Function 'def' not found
+; CHECK-FUNC-DEF: error: Function 'def' not found
; CHECK-BB-LEVEL: Function: abc
; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00 3672.00 3714.00 ]
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
index f9aa108..9d60e12 100644
--- a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
@@ -49,7 +49,7 @@ entry:
; CHECK-FUNC-LEVEL-ABC: Function: abc
; CHECK-FUNC-LEVEL-NEXT-ABC: [ 878.00 889.00 900.00 ]
-; CHECK-FUNC-DEF: Error: Function 'def' not found
+; CHECK-FUNC-DEF: error: Function 'def' not found
; CHECK-BB-LEVEL: Function: abc
; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ]
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir
new file mode 100644
index 0000000..ef835fe
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir
@@ -0,0 +1,92 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+# RUN: llvm-ir2vec embeddings --mode=mir --level=func --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+# RUN: llvm-ir2vec embeddings --mode=mir --level=func --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ADD
+# RUN: not llvm-ir2vec embeddings --mode=mir --level=func --function=missing_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-MISSING
+# RUN: llvm-ir2vec embeddings --mode=mir --level=bb --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+# RUN: llvm-ir2vec embeddings --mode=mir --level=inst --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL
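+#
+# The RUN lines above cover each output granularity: the default and
+# --level=func print one vector per machine function, --level=bb one per
+# machine basic block, and --level=inst one per instruction; --function
+# restricts the output to a single machine function.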
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) {
+ entry:
+ %sum = add nsw i32 %a, %b
+ %result = mul nsw i32 %sum, 2
+ ret i32 %result
+ }
+
+ define dso_local void @simple_function() {
+ entry:
+ ret void
+ }
+...
+---
+name: add_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+ - { id: 3, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags
+ %3:gr32 = ADD32rr %2, %2, implicit-def dead $eflags
+ $eax = COPY %3
+ RET 0, $eax
+
+---
+name: simple_function
+alignment: 16
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ RET 0
+
+# CHECK-DEFAULT: MIR2Vec embeddings for machine function add_function:
+# CHECK-DEFAULT-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-DEFAULT: MIR2Vec embeddings for machine function simple_function:
+# CHECK-DEFAULT-NEXT: Function vector: [ 1.10 1.20 1.30 ]
+
+# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function simple_function:
+# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 1.10 1.20 1.30 ]
+
+# CHECK-FUNC-LEVEL-ADD: MIR2Vec embeddings for machine function add_function:
+# CHECK-FUNC-LEVEL-ADD-NEXT: Function vector: [ 26.50 27.10 27.70 ]
+# CHECK-FUNC-LEVEL-ADD-NOT: simple_function
+
+# CHECK-FUNC-MISSING: error: Function 'missing_function' not found
+
+# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-BB-LEVEL-NEXT: Basic block vectors:
+# CHECK-BB-LEVEL-NEXT: MBB entry: [ 26.50 27.10 27.70 ]
+# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function simple_function:
+# CHECK-BB-LEVEL-NEXT: Basic block vectors:
+# CHECK-BB-LEVEL-NEXT: MBB entry: [ 1.10 1.20 1.30 ]
+
+# CHECK-INST-LEVEL: MIR2Vec embeddings for machine function add_function:
+# CHECK-INST-LEVEL-NEXT: Instruction vectors:
+# CHECK-INST-LEVEL: %1:gr32 = COPY $esi
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL-NEXT: %0:gr32 = COPY $edi
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL: %2:gr32 = nsw ADD32rr
+# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ]
+# CHECK-INST-LEVEL: %3:gr32 = ADD32rr
+# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ]
+# CHECK-INST-LEVEL: $eax = COPY %3:gr32
+# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ]
+# CHECK-INST-LEVEL: RET 0, $eax
+# CHECK-INST-LEVEL-NEXT: -> [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/tools/llvm-ir2vec/entities.mir b/llvm/test/tools/llvm-ir2vec/entities.mir
new file mode 100644
index 0000000..60d9c7a
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/entities.mir
@@ -0,0 +1,28 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec entities --mode=mir %s -o %t1.log 2>&1
+# RUN: diff %S/output/reference_x86_entities.txt %t1.log
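+#
+# The entities subcommand dumps the vocabulary entity table (a count followed
+# by "opcode-name ID" pairs); the dump is diffed verbatim against the
+# checked-in x86 reference file.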
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @test_function(i32 noundef %a) {
+ entry:
+ ret i32 %a
+ }
+...
+---
+name: test_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $edi
+
+ %0:gr32 = COPY $edi
+ $eax = COPY %0
+ RET 0, $eax
diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.ll b/llvm/test/tools/llvm-ir2vec/error-handling.ll
index b944ea0..8e9e455 100644
--- a/llvm/test/tools/llvm-ir2vec/error-handling.ll
+++ b/llvm/test/tools/llvm-ir2vec/error-handling.ll
@@ -10,4 +10,4 @@ entry:
}
; CHECK-NO-VOCAB: error: IR2Vec vocabulary file path not specified; You may need to set it using --ir2vec-vocab-path
-; CHECK-FUNC-NOT-FOUND: Error: Function 'nonexistent' not found
+; CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent' not found
diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.mir b/llvm/test/tools/llvm-ir2vec/error-handling.mir
new file mode 100644
index 0000000..caec454c
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/error-handling.mir
@@ -0,0 +1,41 @@
+# REQUIRES: x86_64-linux
+# Test error handling and input validation for the llvm-ir2vec tool in MIR mode
+
+# RUN: not llvm-ir2vec embeddings --mode=mir %s 2>&1 | FileCheck %s -check-prefix=CHECK-NO-VOCAB
+# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/nonexistent-vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-VOCAB-NOT-FOUND
+# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-VOCAB
+# RUN: not llvm-ir2vec embeddings --mode=mir --function=nonexistent_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-NOT-FOUND
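+#
+# Each RUN line above triggers a distinct failure: no vocabulary path given,
+# a vocabulary file that does not exist, a vocabulary file missing its
+# 'Opcodes' section, and a --function argument naming a function absent from
+# the module.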
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @test_function(i32 noundef %a) {
+ entry:
+ ret i32 %a
+ }
+...
+---
+name: test_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $edi
+
+ %0:gr32 = COPY $edi
+ $eax = COPY %0
+ RET 0, $eax
+
+# CHECK-NO-VOCAB: error: Failed to load MIR2Vec vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
+
+# CHECK-VOCAB-NOT-FOUND: error: Failed to load MIR2Vec vocabulary
+# CHECK-VOCAB-NOT-FOUND: No such file or directory
+
+# CHECK-INVALID-VOCAB: error: Failed to load MIR2Vec vocabulary - Missing 'Opcodes' section in vocabulary file
+
+# CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent_function' not found
diff --git a/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg b/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg
new file mode 100644
index 0000000..2406f19
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/lit.local.cfg
@@ -0,0 +1,3 @@
+# Don't treat files in this directory as tests
+# These are reference data files, not test scripts
+config.suffixes = []
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
new file mode 100644
index 0000000..dfbac4c
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
@@ -0,0 +1,33 @@
+MAX_RELATION=4
+187 7072 1
+187 6968 2
+187 187 0
+187 7072 1
+187 6969 2
+187 10 0
+10 7072 1
+10 7072 2
+10 7072 3
+10 6961 4
+10 187 0
+187 6952 1
+187 7072 2
+187 1555 0
+1555 6882 1
+1555 6952 2
+187 7072 1
+187 6968 2
+187 187 0
+187 7072 1
+187 6969 2
+187 601 0
+601 7072 1
+601 7072 2
+601 7072 3
+601 6961 4
+601 187 0
+187 6952 1
+187 7072 2
+187 1555 0
+1555 6882 1
+1555 6952 2
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
new file mode 100644
index 0000000..dc436d1
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
@@ -0,0 +1,7174 @@
+7173
+AAA 0
+AAD 1
+AADD 2
+AAM 3
+AAND 4
+AAS 5
+ABS_F 6
+ABS_Fp 7
+ADC 8
+ADCX 9
+ADD 10
+ADDPDrm 11
+ADDPDrr 12
+ADDPSrm 13
+ADDPSrr 14
+ADDR 15
+ADDSDrm 16
+ADDSDrm_Int 17
+ADDSDrr 18
+ADDSDrr_Int 19
+ADDSSrm 20
+ADDSSrm_Int 21
+ADDSSrr 22
+ADDSSrr_Int 23
+ADDSUBPDrm 24
+ADDSUBPDrr 25
+ADDSUBPSrm 26
+ADDSUBPSrr 27
+ADD_F 28
+ADD_FI 29
+ADD_FPrST 30
+ADD_FST 31
+ADD_Fp 32
+ADD_FpI 33
+ADD_FrST 34
+ADJCALLSTACKDOWN 35
+ADJCALLSTACKUP 36
+ADOX 37
+AESDEC 38
+AESDECLASTrm 39
+AESDECLASTrr 40
+AESDECWIDE 41
+AESDECrm 42
+AESDECrr 43
+AESENC 44
+AESENCLASTrm 45
+AESENCLASTrr 46
+AESENCWIDE 47
+AESENCrm 48
+AESENCrr 49
+AESIMCrm 50
+AESIMCrr 51
+AESKEYGENASSISTrmi 52
+AESKEYGENASSISTrri 53
+AND 54
+ANDN 55
+ANDNPDrm 56
+ANDNPDrr 57
+ANDNPSrm 58
+ANDNPSrr 59
+ANDPDrm 60
+ANDPDrr 61
+ANDPSrm 62
+ANDPSrr 63
+ANNOTATION_LABEL 64
+AOR 65
+ARITH_FENCE 66
+ARPL 67
+ASAN_CHECK_MEMACCESS 68
+AVX 69
+AVX_SET 70
+AXOR 71
+BEXTR 72
+BEXTRI 73
+BLCFILL 74
+BLCI 75
+BLCIC 76
+BLCMSK 77
+BLCS 78
+BLENDPDrmi 79
+BLENDPDrri 80
+BLENDPSrmi 81
+BLENDPSrri 82
+BLENDVPDrm 83
+BLENDVPDrr 84
+BLENDVPSrm 85
+BLENDVPSrr 86
+BLSFILL 87
+BLSI 88
+BLSIC 89
+BLSMSK 90
+BLSR 91
+BOUNDS 92
+BSF 93
+BSR 94
+BSWAP 95
+BT 96
+BTC 97
+BTR 98
+BTS 99
+BUNDLE 100
+BZHI 101
+CALL 102
+CALLpcrel 103
+CATCHRET 104
+CBW 105
+CCMP 106
+CDQ 107
+CDQE 108
+CFCMOV 109
+CFI_INSTRUCTION 110
+CHS_F 111
+CHS_Fp 112
+CLAC 113
+CLC 114
+CLD 115
+CLDEMOTE 116
+CLEANUPRET 117
+CLFLUSH 118
+CLFLUSHOPT 119
+CLGI 120
+CLI 121
+CLRSSBSY 122
+CLTS 123
+CLUI 124
+CLWB 125
+CLZERO 126
+CMC 127
+CMOV 128
+CMOVBE_F 129
+CMOVBE_Fp 130
+CMOVB_F 131
+CMOVB_Fp 132
+CMOVE_F 133
+CMOVE_Fp 134
+CMOVNBE_F 135
+CMOVNBE_Fp 136
+CMOVNB_F 137
+CMOVNB_Fp 138
+CMOVNE_F 139
+CMOVNE_Fp 140
+CMOVNP_F 141
+CMOVNP_Fp 142
+CMOVP_F 143
+CMOVP_Fp 144
+CMOV_FR 145
+CMOV_GR 146
+CMOV_RFP 147
+CMOV_VK 148
+CMOV_VR 149
+CMP 150
+CMPCCXADDmr 151
+CMPPDrmi 152
+CMPPDrri 153
+CMPPSrmi 154
+CMPPSrri 155
+CMPSB 156
+CMPSDrmi 157
+CMPSDrmi_Int 158
+CMPSDrri 159
+CMPSDrri_Int 160
+CMPSL 161
+CMPSQ 162
+CMPSSrmi 163
+CMPSSrmi_Int 164
+CMPSSrri 165
+CMPSSrri_Int 166
+CMPSW 167
+CMPXCHG 168
+COMISDrm 169
+COMISDrm_Int 170
+COMISDrr 171
+COMISDrr_Int 172
+COMISSrm 173
+COMISSrm_Int 174
+COMISSrr 175
+COMISSrr_Int 176
+COMP_FST 177
+COM_FIPr 178
+COM_FIr 179
+COM_FST 180
+COM_FpIr 181
+COM_Fpr 182
+CONVERGENCECTRL_ANCHOR 183
+CONVERGENCECTRL_ENTRY 184
+CONVERGENCECTRL_GLUE 185
+CONVERGENCECTRL_LOOP 186
+COPY 187
+COPY_TO_REGCLASS 188
+CPUID 189
+CQO 190
+CRC 191
+CS_PREFIX 192
+CTEST 193
+CVTDQ 194
+CVTPD 195
+CVTPS 196
+CVTSD 197
+CVTSI 198
+CVTSS 199
+CVTTPD 200
+CVTTPS 201
+CVTTSD 202
+CVTTSS 203
+CWD 204
+CWDE 205
+DAA 206
+DAS 207
+DATA 208
+DBG_INSTR_REF 209
+DBG_LABEL 210
+DBG_PHI 211
+DBG_VALUE 212
+DBG_VALUE_LIST 213
+DEC 214
+DIV 215
+DIVPDrm 216
+DIVPDrr 217
+DIVPSrm 218
+DIVPSrr 219
+DIVR_F 220
+DIVR_FI 221
+DIVR_FPrST 222
+DIVR_FST 223
+DIVR_Fp 224
+DIVR_FpI 225
+DIVR_FrST 226
+DIVSDrm 227
+DIVSDrm_Int 228
+DIVSDrr 229
+DIVSDrr_Int 230
+DIVSSrm 231
+DIVSSrm_Int 232
+DIVSSrr 233
+DIVSSrr_Int 234
+DIV_F 235
+DIV_FI 236
+DIV_FPrST 237
+DIV_FST 238
+DIV_Fp 239
+DIV_FpI 240
+DIV_FrST 241
+DPPDrmi 242
+DPPDrri 243
+DPPSrmi 244
+DPPSrri 245
+DS_PREFIX 246
+DYN_ALLOCA 247
+EH_LABEL 248
+EH_RETURN 249
+EH_SjLj_LongJmp 250
+EH_SjLj_SetJmp 251
+EH_SjLj_Setup 252
+ENCLS 253
+ENCLU 254
+ENCLV 255
+ENCODEKEY 256
+ENDBR 257
+ENQCMD 258
+ENQCMDS 259
+ENTER 260
+ERETS 261
+ERETU 262
+ES_PREFIX 263
+EXTRACTPSmri 264
+EXTRACTPSrri 265
+EXTRACT_SUBREG 266
+EXTRQ 267
+EXTRQI 268
+F 269
+FAKE_USE 270
+FARCALL 271
+FARJMP 272
+FAULTING_OP 273
+FBLDm 274
+FBSTPm 275
+FCOM 276
+FCOMP 277
+FCOMPP 278
+FCOS 279
+FDECSTP 280
+FEMMS 281
+FENTRY_CALL 282
+FFREE 283
+FFREEP 284
+FICOM 285
+FICOMP 286
+FINCSTP 287
+FLDCW 288
+FLDENVm 289
+FLDL 290
+FLDLG 291
+FLDLN 292
+FLDPI 293
+FNCLEX 294
+FNINIT 295
+FNOP 296
+FNSTCW 297
+FNSTSW 298
+FNSTSWm 299
+FP 300
+FPATAN 301
+FPREM 302
+FPTAN 303
+FRNDINT 304
+FRSTORm 305
+FSAVEm 306
+FSCALE 307
+FSIN 308
+FSINCOS 309
+FSTENVm 310
+FS_PREFIX 311
+FXRSTOR 312
+FXSAVE 313
+FXTRACT 314
+FYL 315
+FsFLD 316
+GC_LABEL 317
+GETSEC 318
+GF 319
+GS_PREFIX 320
+G_ABDS 321
+G_ABDU 322
+G_ABS 323
+G_ADD 324
+G_ADDRSPACE_CAST 325
+G_AND 326
+G_ANYEXT 327
+G_ASHR 328
+G_ASSERT_ALIGN 329
+G_ASSERT_SEXT 330
+G_ASSERT_ZEXT 331
+G_ATOMICRMW_ADD 332
+G_ATOMICRMW_AND 333
+G_ATOMICRMW_FADD 334
+G_ATOMICRMW_FMAX 335
+G_ATOMICRMW_FMAXIMUM 336
+G_ATOMICRMW_FMIN 337
+G_ATOMICRMW_FMINIMUM 338
+G_ATOMICRMW_FSUB 339
+G_ATOMICRMW_MAX 340
+G_ATOMICRMW_MIN 341
+G_ATOMICRMW_NAND 342
+G_ATOMICRMW_OR 343
+G_ATOMICRMW_SUB 344
+G_ATOMICRMW_UDEC_WRAP 345
+G_ATOMICRMW_UINC_WRAP 346
+G_ATOMICRMW_UMAX 347
+G_ATOMICRMW_UMIN 348
+G_ATOMICRMW_USUB_COND 349
+G_ATOMICRMW_USUB_SAT 350
+G_ATOMICRMW_XCHG 351
+G_ATOMICRMW_XOR 352
+G_ATOMIC_CMPXCHG 353
+G_ATOMIC_CMPXCHG_WITH_SUCCESS 354
+G_BITCAST 355
+G_BITREVERSE 356
+G_BLOCK_ADDR 357
+G_BR 358
+G_BRCOND 359
+G_BRINDIRECT 360
+G_BRJT 361
+G_BSWAP 362
+G_BUILD_VECTOR 363
+G_BUILD_VECTOR_TRUNC 364
+G_BZERO 365
+G_CONCAT_VECTORS 366
+G_CONSTANT 367
+G_CONSTANT_FOLD_BARRIER 368
+G_CONSTANT_POOL 369
+G_CTLZ 370
+G_CTLZ_ZERO_UNDEF 371
+G_CTPOP 372
+G_CTTZ 373
+G_CTTZ_ZERO_UNDEF 374
+G_DEBUGTRAP 375
+G_DYN_STACKALLOC 376
+G_EXTRACT 377
+G_EXTRACT_SUBVECTOR 378
+G_EXTRACT_VECTOR_ELT 379
+G_FABS 380
+G_FACOS 381
+G_FADD 382
+G_FASIN 383
+G_FATAN 384
+G_FCANONICALIZE 385
+G_FCEIL 386
+G_FCMP 387
+G_FCONSTANT 388
+G_FCOPYSIGN 389
+G_FCOS 390
+G_FCOSH 391
+G_FDIV 392
+G_FENCE 393
+G_FEXP 394
+G_FFLOOR 395
+G_FFREXP 396
+G_FILD 397
+G_FIST 398
+G_FLDCW 399
+G_FLDEXP 400
+G_FLOG 401
+G_FMA 402
+G_FMAD 403
+G_FMAXIMUM 404
+G_FMAXIMUMNUM 405
+G_FMAXNUM 406
+G_FMAXNUM_IEEE 407
+G_FMINIMUM 408
+G_FMINIMUMNUM 409
+G_FMINNUM 410
+G_FMINNUM_IEEE 411
+G_FMODF 412
+G_FMUL 413
+G_FNEARBYINT 414
+G_FNEG 415
+G_FNSTCW 416
+G_FPEXT 417
+G_FPOW 418
+G_FPOWI 419
+G_FPTOSI 420
+G_FPTOSI_SAT 421
+G_FPTOUI 422
+G_FPTOUI_SAT 423
+G_FPTRUNC 424
+G_FRAME_INDEX 425
+G_FREEZE 426
+G_FREM 427
+G_FRINT 428
+G_FSHL 429
+G_FSHR 430
+G_FSIN 431
+G_FSINCOS 432
+G_FSINH 433
+G_FSQRT 434
+G_FSUB 435
+G_FTAN 436
+G_FTANH 437
+G_GET_FPENV 438
+G_GET_FPMODE 439
+G_GET_ROUNDING 440
+G_GLOBAL_VALUE 441
+G_ICMP 442
+G_IMPLICIT_DEF 443
+G_INDEXED_LOAD 444
+G_INDEXED_SEXTLOAD 445
+G_INDEXED_STORE 446
+G_INDEXED_ZEXTLOAD 447
+G_INSERT 448
+G_INSERT_SUBVECTOR 449
+G_INSERT_VECTOR_ELT 450
+G_INTRINSIC 451
+G_INTRINSIC_CONVERGENT 452
+G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 453
+G_INTRINSIC_FPTRUNC_ROUND 454
+G_INTRINSIC_LLRINT 455
+G_INTRINSIC_LRINT 456
+G_INTRINSIC_ROUND 457
+G_INTRINSIC_ROUNDEVEN 458
+G_INTRINSIC_TRUNC 459
+G_INTRINSIC_W_SIDE_EFFECTS 460
+G_INTTOPTR 461
+G_INVOKE_REGION_START 462
+G_IS_FPCLASS 463
+G_JUMP_TABLE 464
+G_LLROUND 465
+G_LOAD 466
+G_LROUND 467
+G_LSHR 468
+G_MEMCPY 469
+G_MEMCPY_INLINE 470
+G_MEMMOVE 471
+G_MEMSET 472
+G_MERGE_VALUES 473
+G_MUL 474
+G_OR 475
+G_PHI 476
+G_PREFETCH 477
+G_PTRAUTH_GLOBAL_VALUE 478
+G_PTRMASK 479
+G_PTRTOINT 480
+G_PTR_ADD 481
+G_READCYCLECOUNTER 482
+G_READSTEADYCOUNTER 483
+G_READ_REGISTER 484
+G_RESET_FPENV 485
+G_RESET_FPMODE 486
+G_ROTL 487
+G_ROTR 488
+G_SADDE 489
+G_SADDO 490
+G_SADDSAT 491
+G_SBFX 492
+G_SCMP 493
+G_SDIV 494
+G_SDIVFIX 495
+G_SDIVFIXSAT 496
+G_SDIVREM 497
+G_SELECT 498
+G_SET_FPENV 499
+G_SET_FPMODE 500
+G_SET_ROUNDING 501
+G_SEXT 502
+G_SEXTLOAD 503
+G_SEXT_INREG 504
+G_SHL 505
+G_SHUFFLE_VECTOR 506
+G_SITOFP 507
+G_SMAX 508
+G_SMIN 509
+G_SMULFIX 510
+G_SMULFIXSAT 511
+G_SMULH 512
+G_SMULO 513
+G_SPLAT_VECTOR 514
+G_SREM 515
+G_SSHLSAT 516
+G_SSUBE 517
+G_SSUBO 518
+G_SSUBSAT 519
+G_STACKRESTORE 520
+G_STACKSAVE 521
+G_STEP_VECTOR 522
+G_STORE 523
+G_STRICT_FADD 524
+G_STRICT_FDIV 525
+G_STRICT_FLDEXP 526
+G_STRICT_FMA 527
+G_STRICT_FMUL 528
+G_STRICT_FREM 529
+G_STRICT_FSQRT 530
+G_STRICT_FSUB 531
+G_SUB 532
+G_TRAP 533
+G_TRUNC 534
+G_TRUNC_SSAT_S 535
+G_TRUNC_SSAT_U 536
+G_TRUNC_USAT_U 537
+G_UADDE 538
+G_UADDO 539
+G_UADDSAT 540
+G_UBFX 541
+G_UBSANTRAP 542
+G_UCMP 543
+G_UDIV 544
+G_UDIVFIX 545
+G_UDIVFIXSAT 546
+G_UDIVREM 547
+G_UITOFP 548
+G_UMAX 549
+G_UMIN 550
+G_UMULFIX 551
+G_UMULFIXSAT 552
+G_UMULH 553
+G_UMULO 554
+G_UNMERGE_VALUES 555
+G_UREM 556
+G_USHLSAT 557
+G_USUBE 558
+G_USUBO 559
+G_USUBSAT 560
+G_VAARG 561
+G_VASTART 562
+G_VECREDUCE_ADD 563
+G_VECREDUCE_AND 564
+G_VECREDUCE_FADD 565
+G_VECREDUCE_FMAX 566
+G_VECREDUCE_FMAXIMUM 567
+G_VECREDUCE_FMIN 568
+G_VECREDUCE_FMINIMUM 569
+G_VECREDUCE_FMUL 570
+G_VECREDUCE_MUL 571
+G_VECREDUCE_OR 572
+G_VECREDUCE_SEQ_FADD 573
+G_VECREDUCE_SEQ_FMUL 574
+G_VECREDUCE_SMAX 575
+G_VECREDUCE_SMIN 576
+G_VECREDUCE_UMAX 577
+G_VECREDUCE_UMIN 578
+G_VECREDUCE_XOR 579
+G_VECTOR_COMPRESS 580
+G_VSCALE 581
+G_WRITE_REGISTER 582
+G_XOR 583
+G_ZEXT 584
+G_ZEXTLOAD 585
+HADDPDrm 586
+HADDPDrr 587
+HADDPSrm 588
+HADDPSrr 589
+HLT 590
+HRESET 591
+HSUBPDrm 592
+HSUBPDrr 593
+HSUBPSrm 594
+HSUBPSrr 595
+ICALL_BRANCH_FUNNEL 596
+IDIV 597
+ILD_F 598
+ILD_Fp 599
+IMPLICIT_DEF 600
+IMUL 601
+IMULZU 602
+IN 603
+INC 604
+INCSSPD 605
+INCSSPQ 606
+INDIRECT_THUNK_CALL 607
+INDIRECT_THUNK_TCRETURN 608
+INIT_UNDEF 609
+INLINEASM 610
+INLINEASM_BR 611
+INSB 612
+INSERTPSrmi 613
+INSERTPSrri 614
+INSERTQ 615
+INSERTQI 616
+INSERT_SUBREG 617
+INSL 618
+INSW 619
+INT 620
+INTO 621
+INVD 622
+INVEPT 623
+INVLPG 624
+INVLPGA 625
+INVLPGB 626
+INVPCID 627
+INVVPID 628
+IRET 629
+ISTT_FP 630
+ISTT_Fp 631
+IST_F 632
+IST_FP 633
+IST_Fp 634
+Int_eh_sjlj_setup_dispatch 635
+JCC 636
+JCXZ 637
+JECXZ 638
+JMP 639
+JMPABS 640
+JRCXZ 641
+JUMP_TABLE_DEBUG_INFO 642
+KADDBkk 643
+KADDDkk 644
+KADDQkk 645
+KADDWkk 646
+KANDBkk 647
+KANDDkk 648
+KANDNBkk 649
+KANDNDkk 650
+KANDNQkk 651
+KANDNWkk 652
+KANDQkk 653
+KANDWkk 654
+KCFI_CHECK 655
+KILL 656
+KMOVBkk 657
+KMOVBkk_EVEX 658
+KMOVBkm 659
+KMOVBkm_EVEX 660
+KMOVBkr 661
+KMOVBkr_EVEX 662
+KMOVBmk 663
+KMOVBmk_EVEX 664
+KMOVBrk 665
+KMOVBrk_EVEX 666
+KMOVDkk 667
+KMOVDkk_EVEX 668
+KMOVDkm 669
+KMOVDkm_EVEX 670
+KMOVDkr 671
+KMOVDkr_EVEX 672
+KMOVDmk 673
+KMOVDmk_EVEX 674
+KMOVDrk 675
+KMOVDrk_EVEX 676
+KMOVQkk 677
+KMOVQkk_EVEX 678
+KMOVQkm 679
+KMOVQkm_EVEX 680
+KMOVQkr 681
+KMOVQkr_EVEX 682
+KMOVQmk 683
+KMOVQmk_EVEX 684
+KMOVQrk 685
+KMOVQrk_EVEX 686
+KMOVWkk 687
+KMOVWkk_EVEX 688
+KMOVWkm 689
+KMOVWkm_EVEX 690
+KMOVWkr 691
+KMOVWkr_EVEX 692
+KMOVWmk 693
+KMOVWmk_EVEX 694
+KMOVWrk 695
+KMOVWrk_EVEX 696
+KNOTBkk 697
+KNOTDkk 698
+KNOTQkk 699
+KNOTWkk 700
+KORBkk 701
+KORDkk 702
+KORQkk 703
+KORTESTBkk 704
+KORTESTDkk 705
+KORTESTQkk 706
+KORTESTWkk 707
+KORWkk 708
+KSET 709
+KSHIFTLBki 710
+KSHIFTLDki 711
+KSHIFTLQki 712
+KSHIFTLWki 713
+KSHIFTRBki 714
+KSHIFTRDki 715
+KSHIFTRQki 716
+KSHIFTRWki 717
+KTESTBkk 718
+KTESTDkk 719
+KTESTQkk 720
+KTESTWkk 721
+KUNPCKBWkk 722
+KUNPCKDQkk 723
+KUNPCKWDkk 724
+KXNORBkk 725
+KXNORDkk 726
+KXNORQkk 727
+KXNORWkk 728
+KXORBkk 729
+KXORDkk 730
+KXORQkk 731
+KXORWkk 732
+LAHF 733
+LAR 734
+LCMPXCHG 735
+LDDQUrm 736
+LDMXCSR 737
+LDS 738
+LDTILECFG 739
+LDTILECFG_EVEX 740
+LD_F 741
+LD_Fp 742
+LD_Frr 743
+LEA 744
+LEAVE 745
+LES 746
+LFENCE 747
+LFS 748
+LGDT 749
+LGS 750
+LIDT 751
+LIFETIME_END 752
+LIFETIME_START 753
+LKGS 754
+LLDT 755
+LLWPCB 756
+LMSW 757
+LOADIWKEY 758
+LOAD_STACK_GUARD 759
+LOCAL_ESCAPE 760
+LOCK_ADD 761
+LOCK_AND 762
+LOCK_BTC 763
+LOCK_BTC_RM 764
+LOCK_BTR 765
+LOCK_BTR_RM 766
+LOCK_BTS 767
+LOCK_BTS_RM 768
+LOCK_DEC 769
+LOCK_INC 770
+LOCK_OR 771
+LOCK_PREFIX 772
+LOCK_SUB 773
+LOCK_XOR 774
+LODSB 775
+LODSL 776
+LODSQ 777
+LODSW 778
+LOOP 779
+LOOPE 780
+LOOPNE 781
+LRET 782
+LRETI 783
+LSL 784
+LSS 785
+LTRm 786
+LTRr 787
+LWPINS 788
+LWPVAL 789
+LXADD 790
+LZCNT 791
+MASKMOVDQU 792
+MASKPAIR 793
+MAXCPDrm 794
+MAXCPDrr 795
+MAXCPSrm 796
+MAXCPSrr 797
+MAXCSDrm 798
+MAXCSDrr 799
+MAXCSSrm 800
+MAXCSSrr 801
+MAXPDrm 802
+MAXPDrr 803
+MAXPSrm 804
+MAXPSrr 805
+MAXSDrm 806
+MAXSDrm_Int 807
+MAXSDrr 808
+MAXSDrr_Int 809
+MAXSSrm 810
+MAXSSrm_Int 811
+MAXSSrr 812
+MAXSSrr_Int 813
+MEMBARRIER 814
+MFENCE 815
+MINCPDrm 816
+MINCPDrr 817
+MINCPSrm 818
+MINCPSrr 819
+MINCSDrm 820
+MINCSDrr 821
+MINCSSrm 822
+MINCSSrr 823
+MINPDrm 824
+MINPDrr 825
+MINPSrm 826
+MINPSrr 827
+MINSDrm 828
+MINSDrm_Int 829
+MINSDrr 830
+MINSDrr_Int 831
+MINSSrm 832
+MINSSrm_Int 833
+MINSSrr 834
+MINSSrr_Int 835
+MMX_CVTPD 836
+MMX_CVTPI 837
+MMX_CVTPS 838
+MMX_CVTTPD 839
+MMX_CVTTPS 840
+MMX_EMMS 841
+MMX_MASKMOVQ 842
+MMX_MOVD 843
+MMX_MOVDQ 844
+MMX_MOVFR 845
+MMX_MOVNTQmr 846
+MMX_MOVQ 847
+MMX_PABSBrm 848
+MMX_PABSBrr 849
+MMX_PABSDrm 850
+MMX_PABSDrr 851
+MMX_PABSWrm 852
+MMX_PABSWrr 853
+MMX_PACKSSDWrm 854
+MMX_PACKSSDWrr 855
+MMX_PACKSSWBrm 856
+MMX_PACKSSWBrr 857
+MMX_PACKUSWBrm 858
+MMX_PACKUSWBrr 859
+MMX_PADDBrm 860
+MMX_PADDBrr 861
+MMX_PADDDrm 862
+MMX_PADDDrr 863
+MMX_PADDQrm 864
+MMX_PADDQrr 865
+MMX_PADDSBrm 866
+MMX_PADDSBrr 867
+MMX_PADDSWrm 868
+MMX_PADDSWrr 869
+MMX_PADDUSBrm 870
+MMX_PADDUSBrr 871
+MMX_PADDUSWrm 872
+MMX_PADDUSWrr 873
+MMX_PADDWrm 874
+MMX_PADDWrr 875
+MMX_PALIGNRrmi 876
+MMX_PALIGNRrri 877
+MMX_PANDNrm 878
+MMX_PANDNrr 879
+MMX_PANDrm 880
+MMX_PANDrr 881
+MMX_PAVGBrm 882
+MMX_PAVGBrr 883
+MMX_PAVGWrm 884
+MMX_PAVGWrr 885
+MMX_PCMPEQBrm 886
+MMX_PCMPEQBrr 887
+MMX_PCMPEQDrm 888
+MMX_PCMPEQDrr 889
+MMX_PCMPEQWrm 890
+MMX_PCMPEQWrr 891
+MMX_PCMPGTBrm 892
+MMX_PCMPGTBrr 893
+MMX_PCMPGTDrm 894
+MMX_PCMPGTDrr 895
+MMX_PCMPGTWrm 896
+MMX_PCMPGTWrr 897
+MMX_PEXTRWrri 898
+MMX_PHADDDrm 899
+MMX_PHADDDrr 900
+MMX_PHADDSWrm 901
+MMX_PHADDSWrr 902
+MMX_PHADDWrm 903
+MMX_PHADDWrr 904
+MMX_PHSUBDrm 905
+MMX_PHSUBDrr 906
+MMX_PHSUBSWrm 907
+MMX_PHSUBSWrr 908
+MMX_PHSUBWrm 909
+MMX_PHSUBWrr 910
+MMX_PINSRWrmi 911
+MMX_PINSRWrri 912
+MMX_PMADDUBSWrm 913
+MMX_PMADDUBSWrr 914
+MMX_PMADDWDrm 915
+MMX_PMADDWDrr 916
+MMX_PMAXSWrm 917
+MMX_PMAXSWrr 918
+MMX_PMAXUBrm 919
+MMX_PMAXUBrr 920
+MMX_PMINSWrm 921
+MMX_PMINSWrr 922
+MMX_PMINUBrm 923
+MMX_PMINUBrr 924
+MMX_PMOVMSKBrr 925
+MMX_PMULHRSWrm 926
+MMX_PMULHRSWrr 927
+MMX_PMULHUWrm 928
+MMX_PMULHUWrr 929
+MMX_PMULHWrm 930
+MMX_PMULHWrr 931
+MMX_PMULLWrm 932
+MMX_PMULLWrr 933
+MMX_PMULUDQrm 934
+MMX_PMULUDQrr 935
+MMX_PORrm 936
+MMX_PORrr 937
+MMX_PSADBWrm 938
+MMX_PSADBWrr 939
+MMX_PSHUFBrm 940
+MMX_PSHUFBrr 941
+MMX_PSHUFWmi 942
+MMX_PSHUFWri 943
+MMX_PSIGNBrm 944
+MMX_PSIGNBrr 945
+MMX_PSIGNDrm 946
+MMX_PSIGNDrr 947
+MMX_PSIGNWrm 948
+MMX_PSIGNWrr 949
+MMX_PSLLDri 950
+MMX_PSLLDrm 951
+MMX_PSLLDrr 952
+MMX_PSLLQri 953
+MMX_PSLLQrm 954
+MMX_PSLLQrr 955
+MMX_PSLLWri 956
+MMX_PSLLWrm 957
+MMX_PSLLWrr 958
+MMX_PSRADri 959
+MMX_PSRADrm 960
+MMX_PSRADrr 961
+MMX_PSRAWri 962
+MMX_PSRAWrm 963
+MMX_PSRAWrr 964
+MMX_PSRLDri 965
+MMX_PSRLDrm 966
+MMX_PSRLDrr 967
+MMX_PSRLQri 968
+MMX_PSRLQrm 969
+MMX_PSRLQrr 970
+MMX_PSRLWri 971
+MMX_PSRLWrm 972
+MMX_PSRLWrr 973
+MMX_PSUBBrm 974
+MMX_PSUBBrr 975
+MMX_PSUBDrm 976
+MMX_PSUBDrr 977
+MMX_PSUBQrm 978
+MMX_PSUBQrr 979
+MMX_PSUBSBrm 980
+MMX_PSUBSBrr 981
+MMX_PSUBSWrm 982
+MMX_PSUBSWrr 983
+MMX_PSUBUSBrm 984
+MMX_PSUBUSBrr 985
+MMX_PSUBUSWrm 986
+MMX_PSUBUSWrr 987
+MMX_PSUBWrm 988
+MMX_PSUBWrr 989
+MMX_PUNPCKHBWrm 990
+MMX_PUNPCKHBWrr 991
+MMX_PUNPCKHDQrm 992
+MMX_PUNPCKHDQrr 993
+MMX_PUNPCKHWDrm 994
+MMX_PUNPCKHWDrr 995
+MMX_PUNPCKLBWrm 996
+MMX_PUNPCKLBWrr 997
+MMX_PUNPCKLDQrm 998
+MMX_PUNPCKLDQrr 999
+MMX_PUNPCKLWDrm 1000
+MMX_PUNPCKLWDrr 1001
+MMX_PXORrm 1002
+MMX_PXORrr 1003
+MMX_SET 1004
+MONITOR 1005
+MONITORX 1006
+MONTMUL 1007
+MORESTACK_RET 1008
+MORESTACK_RET_RESTORE_R 1009
+MOV 1010
+MOVAPDmr 1011
+MOVAPDrm 1012
+MOVAPDrr 1013
+MOVAPDrr_REV 1014
+MOVAPSmr 1015
+MOVAPSrm 1016
+MOVAPSrr 1017
+MOVAPSrr_REV 1018
+MOVBE 1019
+MOVDDUPrm 1020
+MOVDDUPrr 1021
+MOVDI 1022
+MOVDIR 1023
+MOVDIRI 1024
+MOVDQAmr 1025
+MOVDQArm 1026
+MOVDQArr 1027
+MOVDQArr_REV 1028
+MOVDQUmr 1029
+MOVDQUrm 1030
+MOVDQUrr 1031
+MOVDQUrr_REV 1032
+MOVHLPSrr 1033
+MOVHPDmr 1034
+MOVHPDrm 1035
+MOVHPSmr 1036
+MOVHPSrm 1037
+MOVLHPSrr 1038
+MOVLPDmr 1039
+MOVLPDrm 1040
+MOVLPSmr 1041
+MOVLPSrm 1042
+MOVMSKPDrr 1043
+MOVMSKPSrr 1044
+MOVNTDQArm 1045
+MOVNTDQmr 1046
+MOVNTI 1047
+MOVNTImr 1048
+MOVNTPDmr 1049
+MOVNTPSmr 1050
+MOVNTSD 1051
+MOVNTSS 1052
+MOVPC 1053
+MOVPDI 1054
+MOVPQI 1055
+MOVPQIto 1056
+MOVQI 1057
+MOVRS 1058
+MOVSB 1059
+MOVSDmr 1060
+MOVSDrm 1061
+MOVSDrm_alt 1062
+MOVSDrr 1063
+MOVSDrr_REV 1064
+MOVSDto 1065
+MOVSHDUPrm 1066
+MOVSHDUPrr 1067
+MOVSHPmr 1068
+MOVSHPrm 1069
+MOVSL 1070
+MOVSLDUPrm 1071
+MOVSLDUPrr 1072
+MOVSQ 1073
+MOVSS 1074
+MOVSSmr 1075
+MOVSSrm 1076
+MOVSSrm_alt 1077
+MOVSSrr 1078
+MOVSSrr_REV 1079
+MOVSW 1080
+MOVSX 1081
+MOVUPDmr 1082
+MOVUPDrm 1083
+MOVUPDrr 1084
+MOVUPDrr_REV 1085
+MOVUPSmr 1086
+MOVUPSrm 1087
+MOVUPSrr 1088
+MOVUPSrr_REV 1089
+MOVZPQILo 1090
+MOVZX 1091
+MPSADBWrmi 1092
+MPSADBWrri 1093
+MUL 1094
+MULPDrm 1095
+MULPDrr 1096
+MULPSrm 1097
+MULPSrr 1098
+MULSDrm 1099
+MULSDrm_Int 1100
+MULSDrr 1101
+MULSDrr_Int 1102
+MULSSrm 1103
+MULSSrm_Int 1104
+MULSSrr 1105
+MULSSrr_Int 1106
+MULX 1107
+MUL_F 1108
+MUL_FI 1109
+MUL_FPrST 1110
+MUL_FST 1111
+MUL_Fp 1112
+MUL_FpI 1113
+MUL_FrST 1114
+MWAITX 1115
+MWAITX_SAVE_RBX 1116
+MWAITXrrr 1117
+MWAITrr 1118
+NEG 1119
+NOOP 1120
+NOOPL 1121
+NOOPLr 1122
+NOOPQ 1123
+NOOPQr 1124
+NOOPW 1125
+NOOPWr 1126
+NOT 1127
+OR 1128
+ORPDrm 1129
+ORPDrr 1130
+ORPSrm 1131
+ORPSrr 1132
+OUT 1133
+OUTSB 1134
+OUTSL 1135
+OUTSW 1136
+PABSBrm 1137
+PABSBrr 1138
+PABSDrm 1139
+PABSDrr 1140
+PABSWrm 1141
+PABSWrr 1142
+PACKSSDWrm 1143
+PACKSSDWrr 1144
+PACKSSWBrm 1145
+PACKSSWBrr 1146
+PACKUSDWrm 1147
+PACKUSDWrr 1148
+PACKUSWBrm 1149
+PACKUSWBrr 1150
+PADDBrm 1151
+PADDBrr 1152
+PADDDrm 1153
+PADDDrr 1154
+PADDQrm 1155
+PADDQrr 1156
+PADDSBrm 1157
+PADDSBrr 1158
+PADDSWrm 1159
+PADDSWrr 1160
+PADDUSBrm 1161
+PADDUSBrr 1162
+PADDUSWrm 1163
+PADDUSWrr 1164
+PADDWrm 1165
+PADDWrr 1166
+PALIGNRrmi 1167
+PALIGNRrri 1168
+PANDNrm 1169
+PANDNrr 1170
+PANDrm 1171
+PANDrr 1172
+PATCHABLE_EVENT_CALL 1173
+PATCHABLE_FUNCTION_ENTER 1174
+PATCHABLE_FUNCTION_EXIT 1175
+PATCHABLE_OP 1176
+PATCHABLE_RET 1177
+PATCHABLE_TAIL_CALL 1178
+PATCHABLE_TYPED_EVENT_CALL 1179
+PATCHPOINT 1180
+PAUSE 1181
+PAVGBrm 1182
+PAVGBrr 1183
+PAVGUSBrm 1184
+PAVGUSBrr 1185
+PAVGWrm 1186
+PAVGWrr 1187
+PBLENDVBrm 1188
+PBLENDVBrr 1189
+PBLENDWrmi 1190
+PBLENDWrri 1191
+PBNDKB 1192
+PCLMULQDQrmi 1193
+PCLMULQDQrri 1194
+PCMPEQBrm 1195
+PCMPEQBrr 1196
+PCMPEQDrm 1197
+PCMPEQDrr 1198
+PCMPEQQrm 1199
+PCMPEQQrr 1200
+PCMPEQWrm 1201
+PCMPEQWrr 1202
+PCMPESTRIrmi 1203
+PCMPESTRIrri 1204
+PCMPESTRMrmi 1205
+PCMPESTRMrri 1206
+PCMPGTBrm 1207
+PCMPGTBrr 1208
+PCMPGTDrm 1209
+PCMPGTDrr 1210
+PCMPGTQrm 1211
+PCMPGTQrr 1212
+PCMPGTWrm 1213
+PCMPGTWrr 1214
+PCMPISTRIrmi 1215
+PCMPISTRIrri 1216
+PCMPISTRMrmi 1217
+PCMPISTRMrri 1218
+PCONFIG 1219
+PDEP 1220
+PEXT 1221
+PEXTRBmri 1222
+PEXTRBrri 1223
+PEXTRDmri 1224
+PEXTRDrri 1225
+PEXTRQmri 1226
+PEXTRQrri 1227
+PEXTRWmri 1228
+PEXTRWrri 1229
+PEXTRWrri_REV 1230
+PF 1231
+PFACCrm 1232
+PFACCrr 1233
+PFADDrm 1234
+PFADDrr 1235
+PFCMPEQrm 1236
+PFCMPEQrr 1237
+PFCMPGErm 1238
+PFCMPGErr 1239
+PFCMPGTrm 1240
+PFCMPGTrr 1241
+PFMAXrm 1242
+PFMAXrr 1243
+PFMINrm 1244
+PFMINrr 1245
+PFMULrm 1246
+PFMULrr 1247
+PFNACCrm 1248
+PFNACCrr 1249
+PFPNACCrm 1250
+PFPNACCrr 1251
+PFRCPIT 1252
+PFRCPrm 1253
+PFRCPrr 1254
+PFRSQIT 1255
+PFRSQRTrm 1256
+PFRSQRTrr 1257
+PFSUBRrm 1258
+PFSUBRrr 1259
+PFSUBrm 1260
+PFSUBrr 1261
+PHADDDrm 1262
+PHADDDrr 1263
+PHADDSWrm 1264
+PHADDSWrr 1265
+PHADDWrm 1266
+PHADDWrr 1267
+PHI 1268
+PHMINPOSUWrm 1269
+PHMINPOSUWrr 1270
+PHSUBDrm 1271
+PHSUBDrr 1272
+PHSUBSWrm 1273
+PHSUBSWrr 1274
+PHSUBWrm 1275
+PHSUBWrr 1276
+PI 1277
+PINSRBrmi 1278
+PINSRBrri 1279
+PINSRDrmi 1280
+PINSRDrri 1281
+PINSRQrmi 1282
+PINSRQrri 1283
+PINSRWrmi 1284
+PINSRWrri 1285
+PLDTILECFGV 1286
+PLEA 1287
+PMADDUBSWrm 1288
+PMADDUBSWrr 1289
+PMADDWDrm 1290
+PMADDWDrr 1291
+PMAXSBrm 1292
+PMAXSBrr 1293
+PMAXSDrm 1294
+PMAXSDrr 1295
+PMAXSWrm 1296
+PMAXSWrr 1297
+PMAXUBrm 1298
+PMAXUBrr 1299
+PMAXUDrm 1300
+PMAXUDrr 1301
+PMAXUWrm 1302
+PMAXUWrr 1303
+PMINSBrm 1304
+PMINSBrr 1305
+PMINSDrm 1306
+PMINSDrr 1307
+PMINSWrm 1308
+PMINSWrr 1309
+PMINUBrm 1310
+PMINUBrr 1311
+PMINUDrm 1312
+PMINUDrr 1313
+PMINUWrm 1314
+PMINUWrr 1315
+PMOVMSKBrr 1316
+PMOVSXBDrm 1317
+PMOVSXBDrr 1318
+PMOVSXBQrm 1319
+PMOVSXBQrr 1320
+PMOVSXBWrm 1321
+PMOVSXBWrr 1322
+PMOVSXDQrm 1323
+PMOVSXDQrr 1324
+PMOVSXWDrm 1325
+PMOVSXWDrr 1326
+PMOVSXWQrm 1327
+PMOVSXWQrr 1328
+PMOVZXBDrm 1329
+PMOVZXBDrr 1330
+PMOVZXBQrm 1331
+PMOVZXBQrr 1332
+PMOVZXBWrm 1333
+PMOVZXBWrr 1334
+PMOVZXDQrm 1335
+PMOVZXDQrr 1336
+PMOVZXWDrm 1337
+PMOVZXWDrr 1338
+PMOVZXWQrm 1339
+PMOVZXWQrr 1340
+PMULDQrm 1341
+PMULDQrr 1342
+PMULHRSWrm 1343
+PMULHRSWrr 1344
+PMULHRWrm 1345
+PMULHRWrr 1346
+PMULHUWrm 1347
+PMULHUWrr 1348
+PMULHWrm 1349
+PMULHWrr 1350
+PMULLDrm 1351
+PMULLDrr 1352
+PMULLWrm 1353
+PMULLWrr 1354
+PMULUDQrm 1355
+PMULUDQrr 1356
+POP 1357
+POPA 1358
+POPCNT 1359
+POPDS 1360
+POPES 1361
+POPF 1362
+POPFS 1363
+POPGS 1364
+POPP 1365
+POPSS 1366
+PORrm 1367
+PORrr 1368
+PREALLOCATED_ARG 1369
+PREALLOCATED_SETUP 1370
+PREFETCH 1371
+PREFETCHIT 1372
+PREFETCHNTA 1373
+PREFETCHRST 1374
+PREFETCHT 1375
+PREFETCHW 1376
+PREFETCHWT 1377
+PROBED_ALLOCA 1378
+PSADBWrm 1379
+PSADBWrr 1380
+PSEUDO_PROBE 1381
+PSHUFBrm 1382
+PSHUFBrr 1383
+PSHUFDmi 1384
+PSHUFDri 1385
+PSHUFHWmi 1386
+PSHUFHWri 1387
+PSHUFLWmi 1388
+PSHUFLWri 1389
+PSIGNBrm 1390
+PSIGNBrr 1391
+PSIGNDrm 1392
+PSIGNDrr 1393
+PSIGNWrm 1394
+PSIGNWrr 1395
+PSLLDQri 1396
+PSLLDri 1397
+PSLLDrm 1398
+PSLLDrr 1399
+PSLLQri 1400
+PSLLQrm 1401
+PSLLQrr 1402
+PSLLWri 1403
+PSLLWrm 1404
+PSLLWrr 1405
+PSMASH 1406
+PSRADri 1407
+PSRADrm 1408
+PSRADrr 1409
+PSRAWri 1410
+PSRAWrm 1411
+PSRAWrr 1412
+PSRLDQri 1413
+PSRLDri 1414
+PSRLDrm 1415
+PSRLDrr 1416
+PSRLQri 1417
+PSRLQrm 1418
+PSRLQrr 1419
+PSRLWri 1420
+PSRLWrm 1421
+PSRLWrr 1422
+PSUBBrm 1423
+PSUBBrr 1424
+PSUBDrm 1425
+PSUBDrr 1426
+PSUBQrm 1427
+PSUBQrr 1428
+PSUBSBrm 1429
+PSUBSBrr 1430
+PSUBSWrm 1431
+PSUBSWrr 1432
+PSUBUSBrm 1433
+PSUBUSBrr 1434
+PSUBUSWrm 1435
+PSUBUSWrr 1436
+PSUBWrm 1437
+PSUBWrr 1438
+PSWAPDrm 1439
+PSWAPDrr 1440
+PT 1441
+PTCMMIMFP 1442
+PTCMMRLFP 1443
+PTCONJTCMMIMFP 1444
+PTCONJTFP 1445
+PTCVTROWD 1446
+PTCVTROWPS 1447
+PTDPBF 1448
+PTDPBHF 1449
+PTDPBSSD 1450
+PTDPBSSDV 1451
+PTDPBSUD 1452
+PTDPBSUDV 1453
+PTDPBUSD 1454
+PTDPBUSDV 1455
+PTDPBUUD 1456
+PTDPBUUDV 1457
+PTDPFP 1458
+PTDPHBF 1459
+PTDPHF 1460
+PTESTrm 1461
+PTESTrr 1462
+PTILELOADD 1463
+PTILELOADDRS 1464
+PTILELOADDRST 1465
+PTILELOADDRSV 1466
+PTILELOADDT 1467
+PTILELOADDV 1468
+PTILEMOVROWrre 1469
+PTILEMOVROWrreV 1470
+PTILEMOVROWrri 1471
+PTILEMOVROWrriV 1472
+PTILEPAIRLOAD 1473
+PTILEPAIRSTORE 1474
+PTILESTORED 1475
+PTILESTOREDV 1476
+PTILEZERO 1477
+PTILEZEROV 1478
+PTMMULTF 1479
+PTTCMMIMFP 1480
+PTTCMMRLFP 1481
+PTTDPBF 1482
+PTTDPFP 1483
+PTTMMULTF 1484
+PTTRANSPOSED 1485
+PTTRANSPOSEDV 1486
+PTWRITE 1487
+PTWRITEm 1488
+PTWRITEr 1489
+PUNPCKHBWrm 1490
+PUNPCKHBWrr 1491
+PUNPCKHDQrm 1492
+PUNPCKHDQrr 1493
+PUNPCKHQDQrm 1494
+PUNPCKHQDQrr 1495
+PUNPCKHWDrm 1496
+PUNPCKHWDrr 1497
+PUNPCKLBWrm 1498
+PUNPCKLBWrr 1499
+PUNPCKLDQrm 1500
+PUNPCKLDQrr 1501
+PUNPCKLQDQrm 1502
+PUNPCKLQDQrr 1503
+PUNPCKLWDrm 1504
+PUNPCKLWDrr 1505
+PUSH 1506
+PUSHA 1507
+PUSHCS 1508
+PUSHDS 1509
+PUSHES 1510
+PUSHF 1511
+PUSHFS 1512
+PUSHGS 1513
+PUSHP 1514
+PUSHSS 1515
+PVALIDATE 1516
+PXORrm 1517
+PXORrr 1518
+RCL 1519
+RCPPSm 1520
+RCPPSr 1521
+RCPSSm 1522
+RCPSSm_Int 1523
+RCPSSr 1524
+RCPSSr_Int 1525
+RCR 1526
+RDFLAGS 1527
+RDFSBASE 1528
+RDGSBASE 1529
+RDMSR 1530
+RDMSRLIST 1531
+RDMSRri 1532
+RDMSRri_EVEX 1533
+RDPID 1534
+RDPKRUr 1535
+RDPMC 1536
+RDPRU 1537
+RDRAND 1538
+RDSEED 1539
+RDSSPD 1540
+RDSSPQ 1541
+RDTSC 1542
+RDTSCP 1543
+REG_SEQUENCE 1544
+REPNE_PREFIX 1545
+REP_MOVSB 1546
+REP_MOVSD 1547
+REP_MOVSQ 1548
+REP_MOVSW 1549
+REP_PREFIX 1550
+REP_STOSB 1551
+REP_STOSD 1552
+REP_STOSQ 1553
+REP_STOSW 1554
+RET 1555
+RETI 1556
+REX 1557
+RMPADJUST 1558
+RMPQUERY 1559
+RMPUPDATE 1560
+ROL 1561
+ROR 1562
+RORX 1563
+ROUNDPDmi 1564
+ROUNDPDri 1565
+ROUNDPSmi 1566
+ROUNDPSri 1567
+ROUNDSDmi 1568
+ROUNDSDmi_Int 1569
+ROUNDSDri 1570
+ROUNDSDri_Int 1571
+ROUNDSSmi 1572
+ROUNDSSmi_Int 1573
+ROUNDSSri 1574
+ROUNDSSri_Int 1575
+RSM 1576
+RSQRTPSm 1577
+RSQRTPSr 1578
+RSQRTSSm 1579
+RSQRTSSm_Int 1580
+RSQRTSSr 1581
+RSQRTSSr_Int 1582
+RSTORSSP 1583
+SAHF 1584
+SALC 1585
+SAR 1586
+SARX 1587
+SAVEPREVSSP 1588
+SBB 1589
+SCASB 1590
+SCASL 1591
+SCASQ 1592
+SCASW 1593
+SEAMCALL 1594
+SEAMOPS 1595
+SEAMRET 1596
+SEG_ALLOCA 1597
+SEH_BeginEpilogue 1598
+SEH_EndEpilogue 1599
+SEH_EndPrologue 1600
+SEH_PushFrame 1601
+SEH_PushReg 1602
+SEH_SaveReg 1603
+SEH_SaveXMM 1604
+SEH_SetFrame 1605
+SEH_StackAlign 1606
+SEH_StackAlloc 1607
+SEH_UnwindV 1608
+SEH_UnwindVersion 1609
+SENDUIPI 1610
+SERIALIZE 1611
+SETB_C 1612
+SETCCm 1613
+SETCCm_EVEX 1614
+SETCCr 1615
+SETCCr_EVEX 1616
+SETSSBSY 1617
+SETZUCCm 1618
+SETZUCCr 1619
+SFENCE 1620
+SGDT 1621
+SHA 1622
+SHL 1623
+SHLD 1624
+SHLDROT 1625
+SHLX 1626
+SHR 1627
+SHRD 1628
+SHRDROT 1629
+SHRX 1630
+SHUFPDrmi 1631
+SHUFPDrri 1632
+SHUFPSrmi 1633
+SHUFPSrri 1634
+SIDT 1635
+SKINIT 1636
+SLDT 1637
+SLWPCB 1638
+SMSW 1639
+SQRTPDm 1640
+SQRTPDr 1641
+SQRTPSm 1642
+SQRTPSr 1643
+SQRTSDm 1644
+SQRTSDm_Int 1645
+SQRTSDr 1646
+SQRTSDr_Int 1647
+SQRTSSm 1648
+SQRTSSm_Int 1649
+SQRTSSr 1650
+SQRTSSr_Int 1651
+SQRT_F 1652
+SQRT_Fp 1653
+SS_PREFIX 1654
+STAC 1655
+STACKALLOC_W_PROBING 1656
+STACKMAP 1657
+STATEPOINT 1658
+STC 1659
+STD 1660
+STGI 1661
+STI 1662
+STMXCSR 1663
+STOSB 1664
+STOSL 1665
+STOSQ 1666
+STOSW 1667
+STR 1668
+STRm 1669
+STTILECFG 1670
+STTILECFG_EVEX 1671
+STUI 1672
+ST_F 1673
+ST_FP 1674
+ST_FPrr 1675
+ST_Fp 1676
+ST_FpP 1677
+ST_Frr 1678
+SUB 1679
+SUBPDrm 1680
+SUBPDrr 1681
+SUBPSrm 1682
+SUBPSrr 1683
+SUBREG_TO_REG 1684
+SUBR_F 1685
+SUBR_FI 1686
+SUBR_FPrST 1687
+SUBR_FST 1688
+SUBR_Fp 1689
+SUBR_FpI 1690
+SUBR_FrST 1691
+SUBSDrm 1692
+SUBSDrm_Int 1693
+SUBSDrr 1694
+SUBSDrr_Int 1695
+SUBSSrm 1696
+SUBSSrm_Int 1697
+SUBSSrr 1698
+SUBSSrr_Int 1699
+SUB_F 1700
+SUB_FI 1701
+SUB_FPrST 1702
+SUB_FST 1703
+SUB_Fp 1704
+SUB_FpI 1705
+SUB_FrST 1706
+SWAPGS 1707
+SYSCALL 1708
+SYSENTER 1709
+SYSEXIT 1710
+SYSRET 1711
+T 1712
+TAILJMPd 1713
+TAILJMPd_CC 1714
+TAILJMPm 1715
+TAILJMPr 1716
+TCMMIMFP 1717
+TCMMRLFP 1718
+TCONJTCMMIMFP 1719
+TCONJTFP 1720
+TCRETURN_HIPE 1721
+TCRETURN_WIN 1722
+TCRETURN_WINmi 1723
+TCRETURNdi 1724
+TCRETURNdicc 1725
+TCRETURNmi 1726
+TCRETURNri 1727
+TCVTROWD 1728
+TCVTROWPS 1729
+TDCALL 1730
+TDPBF 1731
+TDPBHF 1732
+TDPBSSD 1733
+TDPBSUD 1734
+TDPBUSD 1735
+TDPBUUD 1736
+TDPFP 1737
+TDPHBF 1738
+TDPHF 1739
+TEST 1740
+TESTUI 1741
+TILELOADD 1742
+TILELOADDRS 1743
+TILELOADDRST 1744
+TILELOADDRS_EVEX 1745
+TILELOADDT 1746
+TILELOADD_EVEX 1747
+TILEMOVROWrre 1748
+TILEMOVROWrri 1749
+TILERELEASE 1750
+TILESTORED 1751
+TILESTORED_EVEX 1752
+TILEZERO 1753
+TLBSYNC 1754
+TLSCall 1755
+TLS_addr 1756
+TLS_addrX 1757
+TLS_base_addr 1758
+TLS_base_addrX 1759
+TLS_desc 1760
+TMMULTF 1761
+TPAUSE 1762
+TRAP 1763
+TST_F 1764
+TST_Fp 1765
+TTCMMIMFP 1766
+TTCMMRLFP 1767
+TTDPBF 1768
+TTDPFP 1769
+TTMMULTF 1770
+TTRANSPOSED 1771
+TZCNT 1772
+TZMSK 1773
+UBSAN_UD 1774
+UCOMISDrm 1775
+UCOMISDrm_Int 1776
+UCOMISDrr 1777
+UCOMISDrr_Int 1778
+UCOMISSrm 1779
+UCOMISSrm_Int 1780
+UCOMISSrr 1781
+UCOMISSrr_Int 1782
+UCOM_FIPr 1783
+UCOM_FIr 1784
+UCOM_FPPr 1785
+UCOM_FPr 1786
+UCOM_FpIr 1787
+UCOM_Fpr 1788
+UCOM_Fr 1789
+UD 1790
+UIRET 1791
+UMONITOR 1792
+UMWAIT 1793
+UNPCKHPDrm 1794
+UNPCKHPDrr 1795
+UNPCKHPSrm 1796
+UNPCKHPSrr 1797
+UNPCKLPDrm 1798
+UNPCKLPDrr 1799
+UNPCKLPSrm 1800
+UNPCKLPSrr 1801
+URDMSRri 1802
+URDMSRri_EVEX 1803
+URDMSRrr 1804
+URDMSRrr_EVEX 1805
+UWRMSRir 1806
+UWRMSRir_EVEX 1807
+UWRMSRrr 1808
+UWRMSRrr_EVEX 1809
+V 1810
+VAARG 1811
+VAARG_X 1812
+VADDBF 1813
+VADDPDYrm 1814
+VADDPDYrr 1815
+VADDPDZ 1816
+VADDPDZrm 1817
+VADDPDZrmb 1818
+VADDPDZrmbk 1819
+VADDPDZrmbkz 1820
+VADDPDZrmk 1821
+VADDPDZrmkz 1822
+VADDPDZrr 1823
+VADDPDZrrb 1824
+VADDPDZrrbk 1825
+VADDPDZrrbkz 1826
+VADDPDZrrk 1827
+VADDPDZrrkz 1828
+VADDPDrm 1829
+VADDPDrr 1830
+VADDPHZ 1831
+VADDPHZrm 1832
+VADDPHZrmb 1833
+VADDPHZrmbk 1834
+VADDPHZrmbkz 1835
+VADDPHZrmk 1836
+VADDPHZrmkz 1837
+VADDPHZrr 1838
+VADDPHZrrb 1839
+VADDPHZrrbk 1840
+VADDPHZrrbkz 1841
+VADDPHZrrk 1842
+VADDPHZrrkz 1843
+VADDPSYrm 1844
+VADDPSYrr 1845
+VADDPSZ 1846
+VADDPSZrm 1847
+VADDPSZrmb 1848
+VADDPSZrmbk 1849
+VADDPSZrmbkz 1850
+VADDPSZrmk 1851
+VADDPSZrmkz 1852
+VADDPSZrr 1853
+VADDPSZrrb 1854
+VADDPSZrrbk 1855
+VADDPSZrrbkz 1856
+VADDPSZrrk 1857
+VADDPSZrrkz 1858
+VADDPSrm 1859
+VADDPSrr 1860
+VADDSDZrm 1861
+VADDSDZrm_Int 1862
+VADDSDZrmk_Int 1863
+VADDSDZrmkz_Int 1864
+VADDSDZrr 1865
+VADDSDZrr_Int 1866
+VADDSDZrrb_Int 1867
+VADDSDZrrbk_Int 1868
+VADDSDZrrbkz_Int 1869
+VADDSDZrrk_Int 1870
+VADDSDZrrkz_Int 1871
+VADDSDrm 1872
+VADDSDrm_Int 1873
+VADDSDrr 1874
+VADDSDrr_Int 1875
+VADDSHZrm 1876
+VADDSHZrm_Int 1877
+VADDSHZrmk_Int 1878
+VADDSHZrmkz_Int 1879
+VADDSHZrr 1880
+VADDSHZrr_Int 1881
+VADDSHZrrb_Int 1882
+VADDSHZrrbk_Int 1883
+VADDSHZrrbkz_Int 1884
+VADDSHZrrk_Int 1885
+VADDSHZrrkz_Int 1886
+VADDSSZrm 1887
+VADDSSZrm_Int 1888
+VADDSSZrmk_Int 1889
+VADDSSZrmkz_Int 1890
+VADDSSZrr 1891
+VADDSSZrr_Int 1892
+VADDSSZrrb_Int 1893
+VADDSSZrrbk_Int 1894
+VADDSSZrrbkz_Int 1895
+VADDSSZrrk_Int 1896
+VADDSSZrrkz_Int 1897
+VADDSSrm 1898
+VADDSSrm_Int 1899
+VADDSSrr 1900
+VADDSSrr_Int 1901
+VADDSUBPDYrm 1902
+VADDSUBPDYrr 1903
+VADDSUBPDrm 1904
+VADDSUBPDrr 1905
+VADDSUBPSYrm 1906
+VADDSUBPSYrr 1907
+VADDSUBPSrm 1908
+VADDSUBPSrr 1909
+VAESDECLASTYrm 1910
+VAESDECLASTYrr 1911
+VAESDECLASTZ 1912
+VAESDECLASTZrm 1913
+VAESDECLASTZrr 1914
+VAESDECLASTrm 1915
+VAESDECLASTrr 1916
+VAESDECYrm 1917
+VAESDECYrr 1918
+VAESDECZ 1919
+VAESDECZrm 1920
+VAESDECZrr 1921
+VAESDECrm 1922
+VAESDECrr 1923
+VAESENCLASTYrm 1924
+VAESENCLASTYrr 1925
+VAESENCLASTZ 1926
+VAESENCLASTZrm 1927
+VAESENCLASTZrr 1928
+VAESENCLASTrm 1929
+VAESENCLASTrr 1930
+VAESENCYrm 1931
+VAESENCYrr 1932
+VAESENCZ 1933
+VAESENCZrm 1934
+VAESENCZrr 1935
+VAESENCrm 1936
+VAESENCrr 1937
+VAESIMCrm 1938
+VAESIMCrr 1939
+VAESKEYGENASSISTrmi 1940
+VAESKEYGENASSISTrri 1941
+VALIGNDZ 1942
+VALIGNDZrmbi 1943
+VALIGNDZrmbik 1944
+VALIGNDZrmbikz 1945
+VALIGNDZrmi 1946
+VALIGNDZrmik 1947
+VALIGNDZrmikz 1948
+VALIGNDZrri 1949
+VALIGNDZrrik 1950
+VALIGNDZrrikz 1951
+VALIGNQZ 1952
+VALIGNQZrmbi 1953
+VALIGNQZrmbik 1954
+VALIGNQZrmbikz 1955
+VALIGNQZrmi 1956
+VALIGNQZrmik 1957
+VALIGNQZrmikz 1958
+VALIGNQZrri 1959
+VALIGNQZrrik 1960
+VALIGNQZrrikz 1961
+VANDNPDYrm 1962
+VANDNPDYrr 1963
+VANDNPDZ 1964
+VANDNPDZrm 1965
+VANDNPDZrmb 1966
+VANDNPDZrmbk 1967
+VANDNPDZrmbkz 1968
+VANDNPDZrmk 1969
+VANDNPDZrmkz 1970
+VANDNPDZrr 1971
+VANDNPDZrrk 1972
+VANDNPDZrrkz 1973
+VANDNPDrm 1974
+VANDNPDrr 1975
+VANDNPSYrm 1976
+VANDNPSYrr 1977
+VANDNPSZ 1978
+VANDNPSZrm 1979
+VANDNPSZrmb 1980
+VANDNPSZrmbk 1981
+VANDNPSZrmbkz 1982
+VANDNPSZrmk 1983
+VANDNPSZrmkz 1984
+VANDNPSZrr 1985
+VANDNPSZrrk 1986
+VANDNPSZrrkz 1987
+VANDNPSrm 1988
+VANDNPSrr 1989
+VANDPDYrm 1990
+VANDPDYrr 1991
+VANDPDZ 1992
+VANDPDZrm 1993
+VANDPDZrmb 1994
+VANDPDZrmbk 1995
+VANDPDZrmbkz 1996
+VANDPDZrmk 1997
+VANDPDZrmkz 1998
+VANDPDZrr 1999
+VANDPDZrrk 2000
+VANDPDZrrkz 2001
+VANDPDrm 2002
+VANDPDrr 2003
+VANDPSYrm 2004
+VANDPSYrr 2005
+VANDPSZ 2006
+VANDPSZrm 2007
+VANDPSZrmb 2008
+VANDPSZrmbk 2009
+VANDPSZrmbkz 2010
+VANDPSZrmk 2011
+VANDPSZrmkz 2012
+VANDPSZrr 2013
+VANDPSZrrk 2014
+VANDPSZrrkz 2015
+VANDPSrm 2016
+VANDPSrr 2017
+VASTART_SAVE_XMM_REGS 2018
+VBCSTNEBF 2019
+VBCSTNESH 2020
+VBLENDMPDZ 2021
+VBLENDMPDZrm 2022
+VBLENDMPDZrmb 2023
+VBLENDMPDZrmbk 2024
+VBLENDMPDZrmbkz 2025
+VBLENDMPDZrmk 2026
+VBLENDMPDZrmkz 2027
+VBLENDMPDZrr 2028
+VBLENDMPDZrrk 2029
+VBLENDMPDZrrkz 2030
+VBLENDMPSZ 2031
+VBLENDMPSZrm 2032
+VBLENDMPSZrmb 2033
+VBLENDMPSZrmbk 2034
+VBLENDMPSZrmbkz 2035
+VBLENDMPSZrmk 2036
+VBLENDMPSZrmkz 2037
+VBLENDMPSZrr 2038
+VBLENDMPSZrrk 2039
+VBLENDMPSZrrkz 2040
+VBLENDPDYrmi 2041
+VBLENDPDYrri 2042
+VBLENDPDrmi 2043
+VBLENDPDrri 2044
+VBLENDPSYrmi 2045
+VBLENDPSYrri 2046
+VBLENDPSrmi 2047
+VBLENDPSrri 2048
+VBLENDVPDYrmr 2049
+VBLENDVPDYrrr 2050
+VBLENDVPDrmr 2051
+VBLENDVPDrrr 2052
+VBLENDVPSYrmr 2053
+VBLENDVPSYrrr 2054
+VBLENDVPSrmr 2055
+VBLENDVPSrrr 2056
+VBROADCASTF 2057
+VBROADCASTI 2058
+VBROADCASTSDYrm 2059
+VBROADCASTSDYrr 2060
+VBROADCASTSDZ 2061
+VBROADCASTSDZrm 2062
+VBROADCASTSDZrmk 2063
+VBROADCASTSDZrmkz 2064
+VBROADCASTSDZrr 2065
+VBROADCASTSDZrrk 2066
+VBROADCASTSDZrrkz 2067
+VBROADCASTSSYrm 2068
+VBROADCASTSSYrr 2069
+VBROADCASTSSZ 2070
+VBROADCASTSSZrm 2071
+VBROADCASTSSZrmk 2072
+VBROADCASTSSZrmkz 2073
+VBROADCASTSSZrr 2074
+VBROADCASTSSZrrk 2075
+VBROADCASTSSZrrkz 2076
+VBROADCASTSSrm 2077
+VBROADCASTSSrr 2078
+VCMPBF 2079
+VCMPPDYrmi 2080
+VCMPPDYrri 2081
+VCMPPDZ 2082
+VCMPPDZrmbi 2083
+VCMPPDZrmbik 2084
+VCMPPDZrmi 2085
+VCMPPDZrmik 2086
+VCMPPDZrri 2087
+VCMPPDZrrib 2088
+VCMPPDZrribk 2089
+VCMPPDZrrik 2090
+VCMPPDrmi 2091
+VCMPPDrri 2092
+VCMPPHZ 2093
+VCMPPHZrmbi 2094
+VCMPPHZrmbik 2095
+VCMPPHZrmi 2096
+VCMPPHZrmik 2097
+VCMPPHZrri 2098
+VCMPPHZrrib 2099
+VCMPPHZrribk 2100
+VCMPPHZrrik 2101
+VCMPPSYrmi 2102
+VCMPPSYrri 2103
+VCMPPSZ 2104
+VCMPPSZrmbi 2105
+VCMPPSZrmbik 2106
+VCMPPSZrmi 2107
+VCMPPSZrmik 2108
+VCMPPSZrri 2109
+VCMPPSZrrib 2110
+VCMPPSZrribk 2111
+VCMPPSZrrik 2112
+VCMPPSrmi 2113
+VCMPPSrri 2114
+VCMPSDZrmi 2115
+VCMPSDZrmi_Int 2116
+VCMPSDZrmik_Int 2117
+VCMPSDZrri 2118
+VCMPSDZrri_Int 2119
+VCMPSDZrrib_Int 2120
+VCMPSDZrribk_Int 2121
+VCMPSDZrrik_Int 2122
+VCMPSDrmi 2123
+VCMPSDrmi_Int 2124
+VCMPSDrri 2125
+VCMPSDrri_Int 2126
+VCMPSHZrmi 2127
+VCMPSHZrmi_Int 2128
+VCMPSHZrmik_Int 2129
+VCMPSHZrri 2130
+VCMPSHZrri_Int 2131
+VCMPSHZrrib_Int 2132
+VCMPSHZrribk_Int 2133
+VCMPSHZrrik_Int 2134
+VCMPSSZrmi 2135
+VCMPSSZrmi_Int 2136
+VCMPSSZrmik_Int 2137
+VCMPSSZrri 2138
+VCMPSSZrri_Int 2139
+VCMPSSZrrib_Int 2140
+VCMPSSZrribk_Int 2141
+VCMPSSZrrik_Int 2142
+VCMPSSrmi 2143
+VCMPSSrmi_Int 2144
+VCMPSSrri 2145
+VCMPSSrri_Int 2146
+VCOMISBF 2147
+VCOMISDZrm 2148
+VCOMISDZrm_Int 2149
+VCOMISDZrr 2150
+VCOMISDZrr_Int 2151
+VCOMISDZrrb 2152
+VCOMISDrm 2153
+VCOMISDrm_Int 2154
+VCOMISDrr 2155
+VCOMISDrr_Int 2156
+VCOMISHZrm 2157
+VCOMISHZrm_Int 2158
+VCOMISHZrr 2159
+VCOMISHZrr_Int 2160
+VCOMISHZrrb 2161
+VCOMISSZrm 2162
+VCOMISSZrm_Int 2163
+VCOMISSZrr 2164
+VCOMISSZrr_Int 2165
+VCOMISSZrrb 2166
+VCOMISSrm 2167
+VCOMISSrm_Int 2168
+VCOMISSrr 2169
+VCOMISSrr_Int 2170
+VCOMPRESSPDZ 2171
+VCOMPRESSPDZmr 2172
+VCOMPRESSPDZmrk 2173
+VCOMPRESSPDZrr 2174
+VCOMPRESSPDZrrk 2175
+VCOMPRESSPDZrrkz 2176
+VCOMPRESSPSZ 2177
+VCOMPRESSPSZmr 2178
+VCOMPRESSPSZmrk 2179
+VCOMPRESSPSZrr 2180
+VCOMPRESSPSZrrk 2181
+VCOMPRESSPSZrrkz 2182
+VCOMXSDZrm_Int 2183
+VCOMXSDZrr_Int 2184
+VCOMXSDZrrb_Int 2185
+VCOMXSHZrm_Int 2186
+VCOMXSHZrr_Int 2187
+VCOMXSHZrrb_Int 2188
+VCOMXSSZrm_Int 2189
+VCOMXSSZrr_Int 2190
+VCOMXSSZrrb_Int 2191
+VCVT 2192
+VCVTBF 2193
+VCVTBIASPH 2194
+VCVTDQ 2195
+VCVTHF 2196
+VCVTNE 2197
+VCVTNEEBF 2198
+VCVTNEEPH 2199
+VCVTNEOBF 2200
+VCVTNEOPH 2201
+VCVTNEPS 2202
+VCVTPD 2203
+VCVTPH 2204
+VCVTPS 2205
+VCVTQQ 2206
+VCVTSD 2207
+VCVTSH 2208
+VCVTSI 2209
+VCVTSS 2210
+VCVTTBF 2211
+VCVTTPD 2212
+VCVTTPH 2213
+VCVTTPS 2214
+VCVTTSD 2215
+VCVTTSH 2216
+VCVTTSS 2217
+VCVTUDQ 2218
+VCVTUQQ 2219
+VCVTUSI 2220
+VCVTUW 2221
+VCVTW 2222
+VDBPSADBWZ 2223
+VDBPSADBWZrmi 2224
+VDBPSADBWZrmik 2225
+VDBPSADBWZrmikz 2226
+VDBPSADBWZrri 2227
+VDBPSADBWZrrik 2228
+VDBPSADBWZrrikz 2229
+VDIVBF 2230
+VDIVPDYrm 2231
+VDIVPDYrr 2232
+VDIVPDZ 2233
+VDIVPDZrm 2234
+VDIVPDZrmb 2235
+VDIVPDZrmbk 2236
+VDIVPDZrmbkz 2237
+VDIVPDZrmk 2238
+VDIVPDZrmkz 2239
+VDIVPDZrr 2240
+VDIVPDZrrb 2241
+VDIVPDZrrbk 2242
+VDIVPDZrrbkz 2243
+VDIVPDZrrk 2244
+VDIVPDZrrkz 2245
+VDIVPDrm 2246
+VDIVPDrr 2247
+VDIVPHZ 2248
+VDIVPHZrm 2249
+VDIVPHZrmb 2250
+VDIVPHZrmbk 2251
+VDIVPHZrmbkz 2252
+VDIVPHZrmk 2253
+VDIVPHZrmkz 2254
+VDIVPHZrr 2255
+VDIVPHZrrb 2256
+VDIVPHZrrbk 2257
+VDIVPHZrrbkz 2258
+VDIVPHZrrk 2259
+VDIVPHZrrkz 2260
+VDIVPSYrm 2261
+VDIVPSYrr 2262
+VDIVPSZ 2263
+VDIVPSZrm 2264
+VDIVPSZrmb 2265
+VDIVPSZrmbk 2266
+VDIVPSZrmbkz 2267
+VDIVPSZrmk 2268
+VDIVPSZrmkz 2269
+VDIVPSZrr 2270
+VDIVPSZrrb 2271
+VDIVPSZrrbk 2272
+VDIVPSZrrbkz 2273
+VDIVPSZrrk 2274
+VDIVPSZrrkz 2275
+VDIVPSrm 2276
+VDIVPSrr 2277
+VDIVSDZrm 2278
+VDIVSDZrm_Int 2279
+VDIVSDZrmk_Int 2280
+VDIVSDZrmkz_Int 2281
+VDIVSDZrr 2282
+VDIVSDZrr_Int 2283
+VDIVSDZrrb_Int 2284
+VDIVSDZrrbk_Int 2285
+VDIVSDZrrbkz_Int 2286
+VDIVSDZrrk_Int 2287
+VDIVSDZrrkz_Int 2288
+VDIVSDrm 2289
+VDIVSDrm_Int 2290
+VDIVSDrr 2291
+VDIVSDrr_Int 2292
+VDIVSHZrm 2293
+VDIVSHZrm_Int 2294
+VDIVSHZrmk_Int 2295
+VDIVSHZrmkz_Int 2296
+VDIVSHZrr 2297
+VDIVSHZrr_Int 2298
+VDIVSHZrrb_Int 2299
+VDIVSHZrrbk_Int 2300
+VDIVSHZrrbkz_Int 2301
+VDIVSHZrrk_Int 2302
+VDIVSHZrrkz_Int 2303
+VDIVSSZrm 2304
+VDIVSSZrm_Int 2305
+VDIVSSZrmk_Int 2306
+VDIVSSZrmkz_Int 2307
+VDIVSSZrr 2308
+VDIVSSZrr_Int 2309
+VDIVSSZrrb_Int 2310
+VDIVSSZrrbk_Int 2311
+VDIVSSZrrbkz_Int 2312
+VDIVSSZrrk_Int 2313
+VDIVSSZrrkz_Int 2314
+VDIVSSrm 2315
+VDIVSSrm_Int 2316
+VDIVSSrr 2317
+VDIVSSrr_Int 2318
+VDPBF 2319
+VDPPDrmi 2320
+VDPPDrri 2321
+VDPPHPSZ 2322
+VDPPHPSZm 2323
+VDPPHPSZmb 2324
+VDPPHPSZmbk 2325
+VDPPHPSZmbkz 2326
+VDPPHPSZmk 2327
+VDPPHPSZmkz 2328
+VDPPHPSZr 2329
+VDPPHPSZrk 2330
+VDPPHPSZrkz 2331
+VDPPSYrmi 2332
+VDPPSYrri 2333
+VDPPSrmi 2334
+VDPPSrri 2335
+VERRm 2336
+VERRr 2337
+VERWm 2338
+VERWr 2339
+VEXP 2340
+VEXPANDPDZ 2341
+VEXPANDPDZrm 2342
+VEXPANDPDZrmk 2343
+VEXPANDPDZrmkz 2344
+VEXPANDPDZrr 2345
+VEXPANDPDZrrk 2346
+VEXPANDPDZrrkz 2347
+VEXPANDPSZ 2348
+VEXPANDPSZrm 2349
+VEXPANDPSZrmk 2350
+VEXPANDPSZrmkz 2351
+VEXPANDPSZrr 2352
+VEXPANDPSZrrk 2353
+VEXPANDPSZrrkz 2354
+VEXTRACTF 2355
+VEXTRACTI 2356
+VEXTRACTPSZmri 2357
+VEXTRACTPSZrri 2358
+VEXTRACTPSmri 2359
+VEXTRACTPSrri 2360
+VFCMADDCPHZ 2361
+VFCMADDCPHZm 2362
+VFCMADDCPHZmb 2363
+VFCMADDCPHZmbk 2364
+VFCMADDCPHZmbkz 2365
+VFCMADDCPHZmk 2366
+VFCMADDCPHZmkz 2367
+VFCMADDCPHZr 2368
+VFCMADDCPHZrb 2369
+VFCMADDCPHZrbk 2370
+VFCMADDCPHZrbkz 2371
+VFCMADDCPHZrk 2372
+VFCMADDCPHZrkz 2373
+VFCMADDCSHZm 2374
+VFCMADDCSHZmk 2375
+VFCMADDCSHZmkz 2376
+VFCMADDCSHZr 2377
+VFCMADDCSHZrb 2378
+VFCMADDCSHZrbk 2379
+VFCMADDCSHZrbkz 2380
+VFCMADDCSHZrk 2381
+VFCMADDCSHZrkz 2382
+VFCMULCPHZ 2383
+VFCMULCPHZrm 2384
+VFCMULCPHZrmb 2385
+VFCMULCPHZrmbk 2386
+VFCMULCPHZrmbkz 2387
+VFCMULCPHZrmk 2388
+VFCMULCPHZrmkz 2389
+VFCMULCPHZrr 2390
+VFCMULCPHZrrb 2391
+VFCMULCPHZrrbk 2392
+VFCMULCPHZrrbkz 2393
+VFCMULCPHZrrk 2394
+VFCMULCPHZrrkz 2395
+VFCMULCSHZrm 2396
+VFCMULCSHZrmk 2397
+VFCMULCSHZrmkz 2398
+VFCMULCSHZrr 2399
+VFCMULCSHZrrb 2400
+VFCMULCSHZrrbk 2401
+VFCMULCSHZrrbkz 2402
+VFCMULCSHZrrk 2403
+VFCMULCSHZrrkz 2404
+VFIXUPIMMPDZ 2405
+VFIXUPIMMPDZrmbi 2406
+VFIXUPIMMPDZrmbik 2407
+VFIXUPIMMPDZrmbikz 2408
+VFIXUPIMMPDZrmi 2409
+VFIXUPIMMPDZrmik 2410
+VFIXUPIMMPDZrmikz 2411
+VFIXUPIMMPDZrri 2412
+VFIXUPIMMPDZrrib 2413
+VFIXUPIMMPDZrribk 2414
+VFIXUPIMMPDZrribkz 2415
+VFIXUPIMMPDZrrik 2416
+VFIXUPIMMPDZrrikz 2417
+VFIXUPIMMPSZ 2418
+VFIXUPIMMPSZrmbi 2419
+VFIXUPIMMPSZrmbik 2420
+VFIXUPIMMPSZrmbikz 2421
+VFIXUPIMMPSZrmi 2422
+VFIXUPIMMPSZrmik 2423
+VFIXUPIMMPSZrmikz 2424
+VFIXUPIMMPSZrri 2425
+VFIXUPIMMPSZrrib 2426
+VFIXUPIMMPSZrribk 2427
+VFIXUPIMMPSZrribkz 2428
+VFIXUPIMMPSZrrik 2429
+VFIXUPIMMPSZrrikz 2430
+VFIXUPIMMSDZrmi 2431
+VFIXUPIMMSDZrmik 2432
+VFIXUPIMMSDZrmikz 2433
+VFIXUPIMMSDZrri 2434
+VFIXUPIMMSDZrrib 2435
+VFIXUPIMMSDZrribk 2436
+VFIXUPIMMSDZrribkz 2437
+VFIXUPIMMSDZrrik 2438
+VFIXUPIMMSDZrrikz 2439
+VFIXUPIMMSSZrmi 2440
+VFIXUPIMMSSZrmik 2441
+VFIXUPIMMSSZrmikz 2442
+VFIXUPIMMSSZrri 2443
+VFIXUPIMMSSZrrib 2444
+VFIXUPIMMSSZrribk 2445
+VFIXUPIMMSSZrribkz 2446
+VFIXUPIMMSSZrrik 2447
+VFIXUPIMMSSZrrikz 2448
+VFMADD 2449
+VFMADDCPHZ 2450
+VFMADDCPHZm 2451
+VFMADDCPHZmb 2452
+VFMADDCPHZmbk 2453
+VFMADDCPHZmbkz 2454
+VFMADDCPHZmk 2455
+VFMADDCPHZmkz 2456
+VFMADDCPHZr 2457
+VFMADDCPHZrb 2458
+VFMADDCPHZrbk 2459
+VFMADDCPHZrbkz 2460
+VFMADDCPHZrk 2461
+VFMADDCPHZrkz 2462
+VFMADDCSHZm 2463
+VFMADDCSHZmk 2464
+VFMADDCSHZmkz 2465
+VFMADDCSHZr 2466
+VFMADDCSHZrb 2467
+VFMADDCSHZrbk 2468
+VFMADDCSHZrbkz 2469
+VFMADDCSHZrk 2470
+VFMADDCSHZrkz 2471
+VFMADDPD 2472
+VFMADDPS 2473
+VFMADDSD 2474
+VFMADDSS 2475
+VFMADDSUB 2476
+VFMADDSUBPD 2477
+VFMADDSUBPS 2478
+VFMSUB 2479
+VFMSUBADD 2480
+VFMSUBADDPD 2481
+VFMSUBADDPS 2482
+VFMSUBPD 2483
+VFMSUBPS 2484
+VFMSUBSD 2485
+VFMSUBSS 2486
+VFMULCPHZ 2487
+VFMULCPHZrm 2488
+VFMULCPHZrmb 2489
+VFMULCPHZrmbk 2490
+VFMULCPHZrmbkz 2491
+VFMULCPHZrmk 2492
+VFMULCPHZrmkz 2493
+VFMULCPHZrr 2494
+VFMULCPHZrrb 2495
+VFMULCPHZrrbk 2496
+VFMULCPHZrrbkz 2497
+VFMULCPHZrrk 2498
+VFMULCPHZrrkz 2499
+VFMULCSHZrm 2500
+VFMULCSHZrmk 2501
+VFMULCSHZrmkz 2502
+VFMULCSHZrr 2503
+VFMULCSHZrrb 2504
+VFMULCSHZrrbk 2505
+VFMULCSHZrrbkz 2506
+VFMULCSHZrrk 2507
+VFMULCSHZrrkz 2508
+VFNMADD 2509
+VFNMADDPD 2510
+VFNMADDPS 2511
+VFNMADDSD 2512
+VFNMADDSS 2513
+VFNMSUB 2514
+VFNMSUBPD 2515
+VFNMSUBPS 2516
+VFNMSUBSD 2517
+VFNMSUBSS 2518
+VFPCLASSBF 2519
+VFPCLASSPDZ 2520
+VFPCLASSPDZmbi 2521
+VFPCLASSPDZmbik 2522
+VFPCLASSPDZmi 2523
+VFPCLASSPDZmik 2524
+VFPCLASSPDZri 2525
+VFPCLASSPDZrik 2526
+VFPCLASSPHZ 2527
+VFPCLASSPHZmbi 2528
+VFPCLASSPHZmbik 2529
+VFPCLASSPHZmi 2530
+VFPCLASSPHZmik 2531
+VFPCLASSPHZri 2532
+VFPCLASSPHZrik 2533
+VFPCLASSPSZ 2534
+VFPCLASSPSZmbi 2535
+VFPCLASSPSZmbik 2536
+VFPCLASSPSZmi 2537
+VFPCLASSPSZmik 2538
+VFPCLASSPSZri 2539
+VFPCLASSPSZrik 2540
+VFPCLASSSDZmi 2541
+VFPCLASSSDZmik 2542
+VFPCLASSSDZri 2543
+VFPCLASSSDZrik 2544
+VFPCLASSSHZmi 2545
+VFPCLASSSHZmik 2546
+VFPCLASSSHZri 2547
+VFPCLASSSHZrik 2548
+VFPCLASSSSZmi 2549
+VFPCLASSSSZmik 2550
+VFPCLASSSSZri 2551
+VFPCLASSSSZrik 2552
+VFRCZPDYrm 2553
+VFRCZPDYrr 2554
+VFRCZPDrm 2555
+VFRCZPDrr 2556
+VFRCZPSYrm 2557
+VFRCZPSYrr 2558
+VFRCZPSrm 2559
+VFRCZPSrr 2560
+VFRCZSDrm 2561
+VFRCZSDrr 2562
+VFRCZSSrm 2563
+VFRCZSSrr 2564
+VGATHERDPDYrm 2565
+VGATHERDPDZ 2566
+VGATHERDPDZrm 2567
+VGATHERDPDrm 2568
+VGATHERDPSYrm 2569
+VGATHERDPSZ 2570
+VGATHERDPSZrm 2571
+VGATHERDPSrm 2572
+VGATHERPF 2573
+VGATHERQPDYrm 2574
+VGATHERQPDZ 2575
+VGATHERQPDZrm 2576
+VGATHERQPDrm 2577
+VGATHERQPSYrm 2578
+VGATHERQPSZ 2579
+VGATHERQPSZrm 2580
+VGATHERQPSrm 2581
+VGETEXPBF 2582
+VGETEXPPDZ 2583
+VGETEXPPDZm 2584
+VGETEXPPDZmb 2585
+VGETEXPPDZmbk 2586
+VGETEXPPDZmbkz 2587
+VGETEXPPDZmk 2588
+VGETEXPPDZmkz 2589
+VGETEXPPDZr 2590
+VGETEXPPDZrb 2591
+VGETEXPPDZrbk 2592
+VGETEXPPDZrbkz 2593
+VGETEXPPDZrk 2594
+VGETEXPPDZrkz 2595
+VGETEXPPHZ 2596
+VGETEXPPHZm 2597
+VGETEXPPHZmb 2598
+VGETEXPPHZmbk 2599
+VGETEXPPHZmbkz 2600
+VGETEXPPHZmk 2601
+VGETEXPPHZmkz 2602
+VGETEXPPHZr 2603
+VGETEXPPHZrb 2604
+VGETEXPPHZrbk 2605
+VGETEXPPHZrbkz 2606
+VGETEXPPHZrk 2607
+VGETEXPPHZrkz 2608
+VGETEXPPSZ 2609
+VGETEXPPSZm 2610
+VGETEXPPSZmb 2611
+VGETEXPPSZmbk 2612
+VGETEXPPSZmbkz 2613
+VGETEXPPSZmk 2614
+VGETEXPPSZmkz 2615
+VGETEXPPSZr 2616
+VGETEXPPSZrb 2617
+VGETEXPPSZrbk 2618
+VGETEXPPSZrbkz 2619
+VGETEXPPSZrk 2620
+VGETEXPPSZrkz 2621
+VGETEXPSDZm 2622
+VGETEXPSDZmk 2623
+VGETEXPSDZmkz 2624
+VGETEXPSDZr 2625
+VGETEXPSDZrb 2626
+VGETEXPSDZrbk 2627
+VGETEXPSDZrbkz 2628
+VGETEXPSDZrk 2629
+VGETEXPSDZrkz 2630
+VGETEXPSHZm 2631
+VGETEXPSHZmk 2632
+VGETEXPSHZmkz 2633
+VGETEXPSHZr 2634
+VGETEXPSHZrb 2635
+VGETEXPSHZrbk 2636
+VGETEXPSHZrbkz 2637
+VGETEXPSHZrk 2638
+VGETEXPSHZrkz 2639
+VGETEXPSSZm 2640
+VGETEXPSSZmk 2641
+VGETEXPSSZmkz 2642
+VGETEXPSSZr 2643
+VGETEXPSSZrb 2644
+VGETEXPSSZrbk 2645
+VGETEXPSSZrbkz 2646
+VGETEXPSSZrk 2647
+VGETEXPSSZrkz 2648
+VGETMANTBF 2649
+VGETMANTPDZ 2650
+VGETMANTPDZrmbi 2651
+VGETMANTPDZrmbik 2652
+VGETMANTPDZrmbikz 2653
+VGETMANTPDZrmi 2654
+VGETMANTPDZrmik 2655
+VGETMANTPDZrmikz 2656
+VGETMANTPDZrri 2657
+VGETMANTPDZrrib 2658
+VGETMANTPDZrribk 2659
+VGETMANTPDZrribkz 2660
+VGETMANTPDZrrik 2661
+VGETMANTPDZrrikz 2662
+VGETMANTPHZ 2663
+VGETMANTPHZrmbi 2664
+VGETMANTPHZrmbik 2665
+VGETMANTPHZrmbikz 2666
+VGETMANTPHZrmi 2667
+VGETMANTPHZrmik 2668
+VGETMANTPHZrmikz 2669
+VGETMANTPHZrri 2670
+VGETMANTPHZrrib 2671
+VGETMANTPHZrribk 2672
+VGETMANTPHZrribkz 2673
+VGETMANTPHZrrik 2674
+VGETMANTPHZrrikz 2675
+VGETMANTPSZ 2676
+VGETMANTPSZrmbi 2677
+VGETMANTPSZrmbik 2678
+VGETMANTPSZrmbikz 2679
+VGETMANTPSZrmi 2680
+VGETMANTPSZrmik 2681
+VGETMANTPSZrmikz 2682
+VGETMANTPSZrri 2683
+VGETMANTPSZrrib 2684
+VGETMANTPSZrribk 2685
+VGETMANTPSZrribkz 2686
+VGETMANTPSZrrik 2687
+VGETMANTPSZrrikz 2688
+VGETMANTSDZrmi 2689
+VGETMANTSDZrmik 2690
+VGETMANTSDZrmikz 2691
+VGETMANTSDZrri 2692
+VGETMANTSDZrrib 2693
+VGETMANTSDZrribk 2694
+VGETMANTSDZrribkz 2695
+VGETMANTSDZrrik 2696
+VGETMANTSDZrrikz 2697
+VGETMANTSHZrmi 2698
+VGETMANTSHZrmik 2699
+VGETMANTSHZrmikz 2700
+VGETMANTSHZrri 2701
+VGETMANTSHZrrib 2702
+VGETMANTSHZrribk 2703
+VGETMANTSHZrribkz 2704
+VGETMANTSHZrrik 2705
+VGETMANTSHZrrikz 2706
+VGETMANTSSZrmi 2707
+VGETMANTSSZrmik 2708
+VGETMANTSSZrmikz 2709
+VGETMANTSSZrri 2710
+VGETMANTSSZrrib 2711
+VGETMANTSSZrribk 2712
+VGETMANTSSZrribkz 2713
+VGETMANTSSZrrik 2714
+VGETMANTSSZrrikz 2715
+VGF 2716
+VHADDPDYrm 2717
+VHADDPDYrr 2718
+VHADDPDrm 2719
+VHADDPDrr 2720
+VHADDPSYrm 2721
+VHADDPSYrr 2722
+VHADDPSrm 2723
+VHADDPSrr 2724
+VHSUBPDYrm 2725
+VHSUBPDYrr 2726
+VHSUBPDrm 2727
+VHSUBPDrr 2728
+VHSUBPSYrm 2729
+VHSUBPSYrr 2730
+VHSUBPSrm 2731
+VHSUBPSrr 2732
+VINSERTF 2733
+VINSERTI 2734
+VINSERTPSZrmi 2735
+VINSERTPSZrri 2736
+VINSERTPSrmi 2737
+VINSERTPSrri 2738
+VLDDQUYrm 2739
+VLDDQUrm 2740
+VLDMXCSR 2741
+VMASKMOVDQU 2742
+VMASKMOVPDYmr 2743
+VMASKMOVPDYrm 2744
+VMASKMOVPDmr 2745
+VMASKMOVPDrm 2746
+VMASKMOVPSYmr 2747
+VMASKMOVPSYrm 2748
+VMASKMOVPSmr 2749
+VMASKMOVPSrm 2750
+VMAXBF 2751
+VMAXCPDYrm 2752
+VMAXCPDYrr 2753
+VMAXCPDZ 2754
+VMAXCPDZrm 2755
+VMAXCPDZrmb 2756
+VMAXCPDZrmbk 2757
+VMAXCPDZrmbkz 2758
+VMAXCPDZrmk 2759
+VMAXCPDZrmkz 2760
+VMAXCPDZrr 2761
+VMAXCPDZrrk 2762
+VMAXCPDZrrkz 2763
+VMAXCPDrm 2764
+VMAXCPDrr 2765
+VMAXCPHZ 2766
+VMAXCPHZrm 2767
+VMAXCPHZrmb 2768
+VMAXCPHZrmbk 2769
+VMAXCPHZrmbkz 2770
+VMAXCPHZrmk 2771
+VMAXCPHZrmkz 2772
+VMAXCPHZrr 2773
+VMAXCPHZrrk 2774
+VMAXCPHZrrkz 2775
+VMAXCPSYrm 2776
+VMAXCPSYrr 2777
+VMAXCPSZ 2778
+VMAXCPSZrm 2779
+VMAXCPSZrmb 2780
+VMAXCPSZrmbk 2781
+VMAXCPSZrmbkz 2782
+VMAXCPSZrmk 2783
+VMAXCPSZrmkz 2784
+VMAXCPSZrr 2785
+VMAXCPSZrrk 2786
+VMAXCPSZrrkz 2787
+VMAXCPSrm 2788
+VMAXCPSrr 2789
+VMAXCSDZrm 2790
+VMAXCSDZrr 2791
+VMAXCSDrm 2792
+VMAXCSDrr 2793
+VMAXCSHZrm 2794
+VMAXCSHZrr 2795
+VMAXCSSZrm 2796
+VMAXCSSZrr 2797
+VMAXCSSrm 2798
+VMAXCSSrr 2799
+VMAXPDYrm 2800
+VMAXPDYrr 2801
+VMAXPDZ 2802
+VMAXPDZrm 2803
+VMAXPDZrmb 2804
+VMAXPDZrmbk 2805
+VMAXPDZrmbkz 2806
+VMAXPDZrmk 2807
+VMAXPDZrmkz 2808
+VMAXPDZrr 2809
+VMAXPDZrrb 2810
+VMAXPDZrrbk 2811
+VMAXPDZrrbkz 2812
+VMAXPDZrrk 2813
+VMAXPDZrrkz 2814
+VMAXPDrm 2815
+VMAXPDrr 2816
+VMAXPHZ 2817
+VMAXPHZrm 2818
+VMAXPHZrmb 2819
+VMAXPHZrmbk 2820
+VMAXPHZrmbkz 2821
+VMAXPHZrmk 2822
+VMAXPHZrmkz 2823
+VMAXPHZrr 2824
+VMAXPHZrrb 2825
+VMAXPHZrrbk 2826
+VMAXPHZrrbkz 2827
+VMAXPHZrrk 2828
+VMAXPHZrrkz 2829
+VMAXPSYrm 2830
+VMAXPSYrr 2831
+VMAXPSZ 2832
+VMAXPSZrm 2833
+VMAXPSZrmb 2834
+VMAXPSZrmbk 2835
+VMAXPSZrmbkz 2836
+VMAXPSZrmk 2837
+VMAXPSZrmkz 2838
+VMAXPSZrr 2839
+VMAXPSZrrb 2840
+VMAXPSZrrbk 2841
+VMAXPSZrrbkz 2842
+VMAXPSZrrk 2843
+VMAXPSZrrkz 2844
+VMAXPSrm 2845
+VMAXPSrr 2846
+VMAXSDZrm 2847
+VMAXSDZrm_Int 2848
+VMAXSDZrmk_Int 2849
+VMAXSDZrmkz_Int 2850
+VMAXSDZrr 2851
+VMAXSDZrr_Int 2852
+VMAXSDZrrb_Int 2853
+VMAXSDZrrbk_Int 2854
+VMAXSDZrrbkz_Int 2855
+VMAXSDZrrk_Int 2856
+VMAXSDZrrkz_Int 2857
+VMAXSDrm 2858
+VMAXSDrm_Int 2859
+VMAXSDrr 2860
+VMAXSDrr_Int 2861
+VMAXSHZrm 2862
+VMAXSHZrm_Int 2863
+VMAXSHZrmk_Int 2864
+VMAXSHZrmkz_Int 2865
+VMAXSHZrr 2866
+VMAXSHZrr_Int 2867
+VMAXSHZrrb_Int 2868
+VMAXSHZrrbk_Int 2869
+VMAXSHZrrbkz_Int 2870
+VMAXSHZrrk_Int 2871
+VMAXSHZrrkz_Int 2872
+VMAXSSZrm 2873
+VMAXSSZrm_Int 2874
+VMAXSSZrmk_Int 2875
+VMAXSSZrmkz_Int 2876
+VMAXSSZrr 2877
+VMAXSSZrr_Int 2878
+VMAXSSZrrb_Int 2879
+VMAXSSZrrbk_Int 2880
+VMAXSSZrrbkz_Int 2881
+VMAXSSZrrk_Int 2882
+VMAXSSZrrkz_Int 2883
+VMAXSSrm 2884
+VMAXSSrm_Int 2885
+VMAXSSrr 2886
+VMAXSSrr_Int 2887
+VMCALL 2888
+VMCLEARm 2889
+VMFUNC 2890
+VMINBF 2891
+VMINCPDYrm 2892
+VMINCPDYrr 2893
+VMINCPDZ 2894
+VMINCPDZrm 2895
+VMINCPDZrmb 2896
+VMINCPDZrmbk 2897
+VMINCPDZrmbkz 2898
+VMINCPDZrmk 2899
+VMINCPDZrmkz 2900
+VMINCPDZrr 2901
+VMINCPDZrrk 2902
+VMINCPDZrrkz 2903
+VMINCPDrm 2904
+VMINCPDrr 2905
+VMINCPHZ 2906
+VMINCPHZrm 2907
+VMINCPHZrmb 2908
+VMINCPHZrmbk 2909
+VMINCPHZrmbkz 2910
+VMINCPHZrmk 2911
+VMINCPHZrmkz 2912
+VMINCPHZrr 2913
+VMINCPHZrrk 2914
+VMINCPHZrrkz 2915
+VMINCPSYrm 2916
+VMINCPSYrr 2917
+VMINCPSZ 2918
+VMINCPSZrm 2919
+VMINCPSZrmb 2920
+VMINCPSZrmbk 2921
+VMINCPSZrmbkz 2922
+VMINCPSZrmk 2923
+VMINCPSZrmkz 2924
+VMINCPSZrr 2925
+VMINCPSZrrk 2926
+VMINCPSZrrkz 2927
+VMINCPSrm 2928
+VMINCPSrr 2929
+VMINCSDZrm 2930
+VMINCSDZrr 2931
+VMINCSDrm 2932
+VMINCSDrr 2933
+VMINCSHZrm 2934
+VMINCSHZrr 2935
+VMINCSSZrm 2936
+VMINCSSZrr 2937
+VMINCSSrm 2938
+VMINCSSrr 2939
+VMINMAXBF 2940
+VMINMAXPDZ 2941
+VMINMAXPDZrmbi 2942
+VMINMAXPDZrmbik 2943
+VMINMAXPDZrmbikz 2944
+VMINMAXPDZrmi 2945
+VMINMAXPDZrmik 2946
+VMINMAXPDZrmikz 2947
+VMINMAXPDZrri 2948
+VMINMAXPDZrrib 2949
+VMINMAXPDZrribk 2950
+VMINMAXPDZrribkz 2951
+VMINMAXPDZrrik 2952
+VMINMAXPDZrrikz 2953
+VMINMAXPHZ 2954
+VMINMAXPHZrmbi 2955
+VMINMAXPHZrmbik 2956
+VMINMAXPHZrmbikz 2957
+VMINMAXPHZrmi 2958
+VMINMAXPHZrmik 2959
+VMINMAXPHZrmikz 2960
+VMINMAXPHZrri 2961
+VMINMAXPHZrrib 2962
+VMINMAXPHZrribk 2963
+VMINMAXPHZrribkz 2964
+VMINMAXPHZrrik 2965
+VMINMAXPHZrrikz 2966
+VMINMAXPSZ 2967
+VMINMAXPSZrmbi 2968
+VMINMAXPSZrmbik 2969
+VMINMAXPSZrmbikz 2970
+VMINMAXPSZrmi 2971
+VMINMAXPSZrmik 2972
+VMINMAXPSZrmikz 2973
+VMINMAXPSZrri 2974
+VMINMAXPSZrrib 2975
+VMINMAXPSZrribk 2976
+VMINMAXPSZrribkz 2977
+VMINMAXPSZrrik 2978
+VMINMAXPSZrrikz 2979
+VMINMAXSDrmi 2980
+VMINMAXSDrmi_Int 2981
+VMINMAXSDrmik_Int 2982
+VMINMAXSDrmikz_Int 2983
+VMINMAXSDrri 2984
+VMINMAXSDrri_Int 2985
+VMINMAXSDrrib_Int 2986
+VMINMAXSDrribk_Int 2987
+VMINMAXSDrribkz_Int 2988
+VMINMAXSDrrik_Int 2989
+VMINMAXSDrrikz_Int 2990
+VMINMAXSHrmi 2991
+VMINMAXSHrmi_Int 2992
+VMINMAXSHrmik_Int 2993
+VMINMAXSHrmikz_Int 2994
+VMINMAXSHrri 2995
+VMINMAXSHrri_Int 2996
+VMINMAXSHrrib_Int 2997
+VMINMAXSHrribk_Int 2998
+VMINMAXSHrribkz_Int 2999
+VMINMAXSHrrik_Int 3000
+VMINMAXSHrrikz_Int 3001
+VMINMAXSSrmi 3002
+VMINMAXSSrmi_Int 3003
+VMINMAXSSrmik_Int 3004
+VMINMAXSSrmikz_Int 3005
+VMINMAXSSrri 3006
+VMINMAXSSrri_Int 3007
+VMINMAXSSrrib_Int 3008
+VMINMAXSSrribk_Int 3009
+VMINMAXSSrribkz_Int 3010
+VMINMAXSSrrik_Int 3011
+VMINMAXSSrrikz_Int 3012
+VMINPDYrm 3013
+VMINPDYrr 3014
+VMINPDZ 3015
+VMINPDZrm 3016
+VMINPDZrmb 3017
+VMINPDZrmbk 3018
+VMINPDZrmbkz 3019
+VMINPDZrmk 3020
+VMINPDZrmkz 3021
+VMINPDZrr 3022
+VMINPDZrrb 3023
+VMINPDZrrbk 3024
+VMINPDZrrbkz 3025
+VMINPDZrrk 3026
+VMINPDZrrkz 3027
+VMINPDrm 3028
+VMINPDrr 3029
+VMINPHZ 3030
+VMINPHZrm 3031
+VMINPHZrmb 3032
+VMINPHZrmbk 3033
+VMINPHZrmbkz 3034
+VMINPHZrmk 3035
+VMINPHZrmkz 3036
+VMINPHZrr 3037
+VMINPHZrrb 3038
+VMINPHZrrbk 3039
+VMINPHZrrbkz 3040
+VMINPHZrrk 3041
+VMINPHZrrkz 3042
+VMINPSYrm 3043
+VMINPSYrr 3044
+VMINPSZ 3045
+VMINPSZrm 3046
+VMINPSZrmb 3047
+VMINPSZrmbk 3048
+VMINPSZrmbkz 3049
+VMINPSZrmk 3050
+VMINPSZrmkz 3051
+VMINPSZrr 3052
+VMINPSZrrb 3053
+VMINPSZrrbk 3054
+VMINPSZrrbkz 3055
+VMINPSZrrk 3056
+VMINPSZrrkz 3057
+VMINPSrm 3058
+VMINPSrr 3059
+VMINSDZrm 3060
+VMINSDZrm_Int 3061
+VMINSDZrmk_Int 3062
+VMINSDZrmkz_Int 3063
+VMINSDZrr 3064
+VMINSDZrr_Int 3065
+VMINSDZrrb_Int 3066
+VMINSDZrrbk_Int 3067
+VMINSDZrrbkz_Int 3068
+VMINSDZrrk_Int 3069
+VMINSDZrrkz_Int 3070
+VMINSDrm 3071
+VMINSDrm_Int 3072
+VMINSDrr 3073
+VMINSDrr_Int 3074
+VMINSHZrm 3075
+VMINSHZrm_Int 3076
+VMINSHZrmk_Int 3077
+VMINSHZrmkz_Int 3078
+VMINSHZrr 3079
+VMINSHZrr_Int 3080
+VMINSHZrrb_Int 3081
+VMINSHZrrbk_Int 3082
+VMINSHZrrbkz_Int 3083
+VMINSHZrrk_Int 3084
+VMINSHZrrkz_Int 3085
+VMINSSZrm 3086
+VMINSSZrm_Int 3087
+VMINSSZrmk_Int 3088
+VMINSSZrmkz_Int 3089
+VMINSSZrr 3090
+VMINSSZrr_Int 3091
+VMINSSZrrb_Int 3092
+VMINSSZrrbk_Int 3093
+VMINSSZrrbkz_Int 3094
+VMINSSZrrk_Int 3095
+VMINSSZrrkz_Int 3096
+VMINSSrm 3097
+VMINSSrm_Int 3098
+VMINSSrr 3099
+VMINSSrr_Int 3100
+VMLAUNCH 3101
+VMLOAD 3102
+VMMCALL 3103
+VMOV 3104
+VMOVAPDYmr 3105
+VMOVAPDYrm 3106
+VMOVAPDYrr 3107
+VMOVAPDYrr_REV 3108
+VMOVAPDZ 3109
+VMOVAPDZmr 3110
+VMOVAPDZmrk 3111
+VMOVAPDZrm 3112
+VMOVAPDZrmk 3113
+VMOVAPDZrmkz 3114
+VMOVAPDZrr 3115
+VMOVAPDZrr_REV 3116
+VMOVAPDZrrk 3117
+VMOVAPDZrrk_REV 3118
+VMOVAPDZrrkz 3119
+VMOVAPDZrrkz_REV 3120
+VMOVAPDmr 3121
+VMOVAPDrm 3122
+VMOVAPDrr 3123
+VMOVAPDrr_REV 3124
+VMOVAPSYmr 3125
+VMOVAPSYrm 3126
+VMOVAPSYrr 3127
+VMOVAPSYrr_REV 3128
+VMOVAPSZ 3129
+VMOVAPSZmr 3130
+VMOVAPSZmrk 3131
+VMOVAPSZrm 3132
+VMOVAPSZrmk 3133
+VMOVAPSZrmkz 3134
+VMOVAPSZrr 3135
+VMOVAPSZrr_REV 3136
+VMOVAPSZrrk 3137
+VMOVAPSZrrk_REV 3138
+VMOVAPSZrrkz 3139
+VMOVAPSZrrkz_REV 3140
+VMOVAPSmr 3141
+VMOVAPSrm 3142
+VMOVAPSrr 3143
+VMOVAPSrr_REV 3144
+VMOVDDUPYrm 3145
+VMOVDDUPYrr 3146
+VMOVDDUPZ 3147
+VMOVDDUPZrm 3148
+VMOVDDUPZrmk 3149
+VMOVDDUPZrmkz 3150
+VMOVDDUPZrr 3151
+VMOVDDUPZrrk 3152
+VMOVDDUPZrrkz 3153
+VMOVDDUPrm 3154
+VMOVDDUPrr 3155
+VMOVDI 3156
+VMOVDQA 3157
+VMOVDQAYmr 3158
+VMOVDQAYrm 3159
+VMOVDQAYrr 3160
+VMOVDQAYrr_REV 3161
+VMOVDQAmr 3162
+VMOVDQArm 3163
+VMOVDQArr 3164
+VMOVDQArr_REV 3165
+VMOVDQU 3166
+VMOVDQUYmr 3167
+VMOVDQUYrm 3168
+VMOVDQUYrr 3169
+VMOVDQUYrr_REV 3170
+VMOVDQUmr 3171
+VMOVDQUrm 3172
+VMOVDQUrr 3173
+VMOVDQUrr_REV 3174
+VMOVHLPSZrr 3175
+VMOVHLPSrr 3176
+VMOVHPDZ 3177
+VMOVHPDmr 3178
+VMOVHPDrm 3179
+VMOVHPSZ 3180
+VMOVHPSmr 3181
+VMOVHPSrm 3182
+VMOVLHPSZrr 3183
+VMOVLHPSrr 3184
+VMOVLPDZ 3185
+VMOVLPDmr 3186
+VMOVLPDrm 3187
+VMOVLPSZ 3188
+VMOVLPSmr 3189
+VMOVLPSrm 3190
+VMOVMSKPDYrr 3191
+VMOVMSKPDrr 3192
+VMOVMSKPSYrr 3193
+VMOVMSKPSrr 3194
+VMOVNTDQAYrm 3195
+VMOVNTDQAZ 3196
+VMOVNTDQAZrm 3197
+VMOVNTDQArm 3198
+VMOVNTDQYmr 3199
+VMOVNTDQZ 3200
+VMOVNTDQZmr 3201
+VMOVNTDQmr 3202
+VMOVNTPDYmr 3203
+VMOVNTPDZ 3204
+VMOVNTPDZmr 3205
+VMOVNTPDmr 3206
+VMOVNTPSYmr 3207
+VMOVNTPSZ 3208
+VMOVNTPSZmr 3209
+VMOVNTPSmr 3210
+VMOVPDI 3211
+VMOVPQI 3212
+VMOVPQIto 3213
+VMOVQI 3214
+VMOVRSBZ 3215
+VMOVRSBZm 3216
+VMOVRSBZmk 3217
+VMOVRSBZmkz 3218
+VMOVRSDZ 3219
+VMOVRSDZm 3220
+VMOVRSDZmk 3221
+VMOVRSDZmkz 3222
+VMOVRSQZ 3223
+VMOVRSQZm 3224
+VMOVRSQZmk 3225
+VMOVRSQZmkz 3226
+VMOVRSWZ 3227
+VMOVRSWZm 3228
+VMOVRSWZmk 3229
+VMOVRSWZmkz 3230
+VMOVSDZmr 3231
+VMOVSDZmrk 3232
+VMOVSDZrm 3233
+VMOVSDZrm_alt 3234
+VMOVSDZrmk 3235
+VMOVSDZrmkz 3236
+VMOVSDZrr 3237
+VMOVSDZrr_REV 3238
+VMOVSDZrrk 3239
+VMOVSDZrrk_REV 3240
+VMOVSDZrrkz 3241
+VMOVSDZrrkz_REV 3242
+VMOVSDmr 3243
+VMOVSDrm 3244
+VMOVSDrm_alt 3245
+VMOVSDrr 3246
+VMOVSDrr_REV 3247
+VMOVSDto 3248
+VMOVSH 3249
+VMOVSHDUPYrm 3250
+VMOVSHDUPYrr 3251
+VMOVSHDUPZ 3252
+VMOVSHDUPZrm 3253
+VMOVSHDUPZrmk 3254
+VMOVSHDUPZrmkz 3255
+VMOVSHDUPZrr 3256
+VMOVSHDUPZrrk 3257
+VMOVSHDUPZrrkz 3258
+VMOVSHDUPrm 3259
+VMOVSHDUPrr 3260
+VMOVSHZmr 3261
+VMOVSHZmrk 3262
+VMOVSHZrm 3263
+VMOVSHZrm_alt 3264
+VMOVSHZrmk 3265
+VMOVSHZrmkz 3266
+VMOVSHZrr 3267
+VMOVSHZrr_REV 3268
+VMOVSHZrrk 3269
+VMOVSHZrrk_REV 3270
+VMOVSHZrrkz 3271
+VMOVSHZrrkz_REV 3272
+VMOVSHtoW 3273
+VMOVSLDUPYrm 3274
+VMOVSLDUPYrr 3275
+VMOVSLDUPZ 3276
+VMOVSLDUPZrm 3277
+VMOVSLDUPZrmk 3278
+VMOVSLDUPZrmkz 3279
+VMOVSLDUPZrr 3280
+VMOVSLDUPZrrk 3281
+VMOVSLDUPZrrkz 3282
+VMOVSLDUPrm 3283
+VMOVSLDUPrr 3284
+VMOVSS 3285
+VMOVSSZmr 3286
+VMOVSSZmrk 3287
+VMOVSSZrm 3288
+VMOVSSZrm_alt 3289
+VMOVSSZrmk 3290
+VMOVSSZrmkz 3291
+VMOVSSZrr 3292
+VMOVSSZrr_REV 3293
+VMOVSSZrrk 3294
+VMOVSSZrrk_REV 3295
+VMOVSSZrrkz 3296
+VMOVSSZrrkz_REV 3297
+VMOVSSmr 3298
+VMOVSSrm 3299
+VMOVSSrm_alt 3300
+VMOVSSrr 3301
+VMOVSSrr_REV 3302
+VMOVUPDYmr 3303
+VMOVUPDYrm 3304
+VMOVUPDYrr 3305
+VMOVUPDYrr_REV 3306
+VMOVUPDZ 3307
+VMOVUPDZmr 3308
+VMOVUPDZmrk 3309
+VMOVUPDZrm 3310
+VMOVUPDZrmk 3311
+VMOVUPDZrmkz 3312
+VMOVUPDZrr 3313
+VMOVUPDZrr_REV 3314
+VMOVUPDZrrk 3315
+VMOVUPDZrrk_REV 3316
+VMOVUPDZrrkz 3317
+VMOVUPDZrrkz_REV 3318
+VMOVUPDmr 3319
+VMOVUPDrm 3320
+VMOVUPDrr 3321
+VMOVUPDrr_REV 3322
+VMOVUPSYmr 3323
+VMOVUPSYrm 3324
+VMOVUPSYrr 3325
+VMOVUPSYrr_REV 3326
+VMOVUPSZ 3327
+VMOVUPSZmr 3328
+VMOVUPSZmrk 3329
+VMOVUPSZrm 3330
+VMOVUPSZrmk 3331
+VMOVUPSZrmkz 3332
+VMOVUPSZrr 3333
+VMOVUPSZrr_REV 3334
+VMOVUPSZrrk 3335
+VMOVUPSZrrk_REV 3336
+VMOVUPSZrrkz 3337
+VMOVUPSZrrkz_REV 3338
+VMOVUPSmr 3339
+VMOVUPSrm 3340
+VMOVUPSrr 3341
+VMOVUPSrr_REV 3342
+VMOVW 3343
+VMOVWmr 3344
+VMOVWrm 3345
+VMOVZPDILo 3346
+VMOVZPQILo 3347
+VMOVZPWILo 3348
+VMPSADBWYrmi 3349
+VMPSADBWYrri 3350
+VMPSADBWZ 3351
+VMPSADBWZrmi 3352
+VMPSADBWZrmik 3353
+VMPSADBWZrmikz 3354
+VMPSADBWZrri 3355
+VMPSADBWZrrik 3356
+VMPSADBWZrrikz 3357
+VMPSADBWrmi 3358
+VMPSADBWrri 3359
+VMPTRLDm 3360
+VMPTRSTm 3361
+VMREAD 3362
+VMRESUME 3363
+VMRUN 3364
+VMSAVE 3365
+VMULBF 3366
+VMULPDYrm 3367
+VMULPDYrr 3368
+VMULPDZ 3369
+VMULPDZrm 3370
+VMULPDZrmb 3371
+VMULPDZrmbk 3372
+VMULPDZrmbkz 3373
+VMULPDZrmk 3374
+VMULPDZrmkz 3375
+VMULPDZrr 3376
+VMULPDZrrb 3377
+VMULPDZrrbk 3378
+VMULPDZrrbkz 3379
+VMULPDZrrk 3380
+VMULPDZrrkz 3381
+VMULPDrm 3382
+VMULPDrr 3383
+VMULPHZ 3384
+VMULPHZrm 3385
+VMULPHZrmb 3386
+VMULPHZrmbk 3387
+VMULPHZrmbkz 3388
+VMULPHZrmk 3389
+VMULPHZrmkz 3390
+VMULPHZrr 3391
+VMULPHZrrb 3392
+VMULPHZrrbk 3393
+VMULPHZrrbkz 3394
+VMULPHZrrk 3395
+VMULPHZrrkz 3396
+VMULPSYrm 3397
+VMULPSYrr 3398
+VMULPSZ 3399
+VMULPSZrm 3400
+VMULPSZrmb 3401
+VMULPSZrmbk 3402
+VMULPSZrmbkz 3403
+VMULPSZrmk 3404
+VMULPSZrmkz 3405
+VMULPSZrr 3406
+VMULPSZrrb 3407
+VMULPSZrrbk 3408
+VMULPSZrrbkz 3409
+VMULPSZrrk 3410
+VMULPSZrrkz 3411
+VMULPSrm 3412
+VMULPSrr 3413
+VMULSDZrm 3414
+VMULSDZrm_Int 3415
+VMULSDZrmk_Int 3416
+VMULSDZrmkz_Int 3417
+VMULSDZrr 3418
+VMULSDZrr_Int 3419
+VMULSDZrrb_Int 3420
+VMULSDZrrbk_Int 3421
+VMULSDZrrbkz_Int 3422
+VMULSDZrrk_Int 3423
+VMULSDZrrkz_Int 3424
+VMULSDrm 3425
+VMULSDrm_Int 3426
+VMULSDrr 3427
+VMULSDrr_Int 3428
+VMULSHZrm 3429
+VMULSHZrm_Int 3430
+VMULSHZrmk_Int 3431
+VMULSHZrmkz_Int 3432
+VMULSHZrr 3433
+VMULSHZrr_Int 3434
+VMULSHZrrb_Int 3435
+VMULSHZrrbk_Int 3436
+VMULSHZrrbkz_Int 3437
+VMULSHZrrk_Int 3438
+VMULSHZrrkz_Int 3439
+VMULSSZrm 3440
+VMULSSZrm_Int 3441
+VMULSSZrmk_Int 3442
+VMULSSZrmkz_Int 3443
+VMULSSZrr 3444
+VMULSSZrr_Int 3445
+VMULSSZrrb_Int 3446
+VMULSSZrrbk_Int 3447
+VMULSSZrrbkz_Int 3448
+VMULSSZrrk_Int 3449
+VMULSSZrrkz_Int 3450
+VMULSSrm 3451
+VMULSSrm_Int 3452
+VMULSSrr 3453
+VMULSSrr_Int 3454
+VMWRITE 3455
+VMXOFF 3456
+VMXON 3457
+VORPDYrm 3458
+VORPDYrr 3459
+VORPDZ 3460
+VORPDZrm 3461
+VORPDZrmb 3462
+VORPDZrmbk 3463
+VORPDZrmbkz 3464
+VORPDZrmk 3465
+VORPDZrmkz 3466
+VORPDZrr 3467
+VORPDZrrk 3468
+VORPDZrrkz 3469
+VORPDrm 3470
+VORPDrr 3471
+VORPSYrm 3472
+VORPSYrr 3473
+VORPSZ 3474
+VORPSZrm 3475
+VORPSZrmb 3476
+VORPSZrmbk 3477
+VORPSZrmbkz 3478
+VORPSZrmk 3479
+VORPSZrmkz 3480
+VORPSZrr 3481
+VORPSZrrk 3482
+VORPSZrrkz 3483
+VORPSrm 3484
+VORPSrr 3485
+VP 3486
+VPABSBYrm 3487
+VPABSBYrr 3488
+VPABSBZ 3489
+VPABSBZrm 3490
+VPABSBZrmk 3491
+VPABSBZrmkz 3492
+VPABSBZrr 3493
+VPABSBZrrk 3494
+VPABSBZrrkz 3495
+VPABSBrm 3496
+VPABSBrr 3497
+VPABSDYrm 3498
+VPABSDYrr 3499
+VPABSDZ 3500
+VPABSDZrm 3501
+VPABSDZrmb 3502
+VPABSDZrmbk 3503
+VPABSDZrmbkz 3504
+VPABSDZrmk 3505
+VPABSDZrmkz 3506
+VPABSDZrr 3507
+VPABSDZrrk 3508
+VPABSDZrrkz 3509
+VPABSDrm 3510
+VPABSDrr 3511
+VPABSQZ 3512
+VPABSQZrm 3513
+VPABSQZrmb 3514
+VPABSQZrmbk 3515
+VPABSQZrmbkz 3516
+VPABSQZrmk 3517
+VPABSQZrmkz 3518
+VPABSQZrr 3519
+VPABSQZrrk 3520
+VPABSQZrrkz 3521
+VPABSWYrm 3522
+VPABSWYrr 3523
+VPABSWZ 3524
+VPABSWZrm 3525
+VPABSWZrmk 3526
+VPABSWZrmkz 3527
+VPABSWZrr 3528
+VPABSWZrrk 3529
+VPABSWZrrkz 3530
+VPABSWrm 3531
+VPABSWrr 3532
+VPACKSSDWYrm 3533
+VPACKSSDWYrr 3534
+VPACKSSDWZ 3535
+VPACKSSDWZrm 3536
+VPACKSSDWZrmb 3537
+VPACKSSDWZrmbk 3538
+VPACKSSDWZrmbkz 3539
+VPACKSSDWZrmk 3540
+VPACKSSDWZrmkz 3541
+VPACKSSDWZrr 3542
+VPACKSSDWZrrk 3543
+VPACKSSDWZrrkz 3544
+VPACKSSDWrm 3545
+VPACKSSDWrr 3546
+VPACKSSWBYrm 3547
+VPACKSSWBYrr 3548
+VPACKSSWBZ 3549
+VPACKSSWBZrm 3550
+VPACKSSWBZrmk 3551
+VPACKSSWBZrmkz 3552
+VPACKSSWBZrr 3553
+VPACKSSWBZrrk 3554
+VPACKSSWBZrrkz 3555
+VPACKSSWBrm 3556
+VPACKSSWBrr 3557
+VPACKUSDWYrm 3558
+VPACKUSDWYrr 3559
+VPACKUSDWZ 3560
+VPACKUSDWZrm 3561
+VPACKUSDWZrmb 3562
+VPACKUSDWZrmbk 3563
+VPACKUSDWZrmbkz 3564
+VPACKUSDWZrmk 3565
+VPACKUSDWZrmkz 3566
+VPACKUSDWZrr 3567
+VPACKUSDWZrrk 3568
+VPACKUSDWZrrkz 3569
+VPACKUSDWrm 3570
+VPACKUSDWrr 3571
+VPACKUSWBYrm 3572
+VPACKUSWBYrr 3573
+VPACKUSWBZ 3574
+VPACKUSWBZrm 3575
+VPACKUSWBZrmk 3576
+VPACKUSWBZrmkz 3577
+VPACKUSWBZrr 3578
+VPACKUSWBZrrk 3579
+VPACKUSWBZrrkz 3580
+VPACKUSWBrm 3581
+VPACKUSWBrr 3582
+VPADDBYrm 3583
+VPADDBYrr 3584
+VPADDBZ 3585
+VPADDBZrm 3586
+VPADDBZrmk 3587
+VPADDBZrmkz 3588
+VPADDBZrr 3589
+VPADDBZrrk 3590
+VPADDBZrrkz 3591
+VPADDBrm 3592
+VPADDBrr 3593
+VPADDDYrm 3594
+VPADDDYrr 3595
+VPADDDZ 3596
+VPADDDZrm 3597
+VPADDDZrmb 3598
+VPADDDZrmbk 3599
+VPADDDZrmbkz 3600
+VPADDDZrmk 3601
+VPADDDZrmkz 3602
+VPADDDZrr 3603
+VPADDDZrrk 3604
+VPADDDZrrkz 3605
+VPADDDrm 3606
+VPADDDrr 3607
+VPADDQYrm 3608
+VPADDQYrr 3609
+VPADDQZ 3610
+VPADDQZrm 3611
+VPADDQZrmb 3612
+VPADDQZrmbk 3613
+VPADDQZrmbkz 3614
+VPADDQZrmk 3615
+VPADDQZrmkz 3616
+VPADDQZrr 3617
+VPADDQZrrk 3618
+VPADDQZrrkz 3619
+VPADDQrm 3620
+VPADDQrr 3621
+VPADDSBYrm 3622
+VPADDSBYrr 3623
+VPADDSBZ 3624
+VPADDSBZrm 3625
+VPADDSBZrmk 3626
+VPADDSBZrmkz 3627
+VPADDSBZrr 3628
+VPADDSBZrrk 3629
+VPADDSBZrrkz 3630
+VPADDSBrm 3631
+VPADDSBrr 3632
+VPADDSWYrm 3633
+VPADDSWYrr 3634
+VPADDSWZ 3635
+VPADDSWZrm 3636
+VPADDSWZrmk 3637
+VPADDSWZrmkz 3638
+VPADDSWZrr 3639
+VPADDSWZrrk 3640
+VPADDSWZrrkz 3641
+VPADDSWrm 3642
+VPADDSWrr 3643
+VPADDUSBYrm 3644
+VPADDUSBYrr 3645
+VPADDUSBZ 3646
+VPADDUSBZrm 3647
+VPADDUSBZrmk 3648
+VPADDUSBZrmkz 3649
+VPADDUSBZrr 3650
+VPADDUSBZrrk 3651
+VPADDUSBZrrkz 3652
+VPADDUSBrm 3653
+VPADDUSBrr 3654
+VPADDUSWYrm 3655
+VPADDUSWYrr 3656
+VPADDUSWZ 3657
+VPADDUSWZrm 3658
+VPADDUSWZrmk 3659
+VPADDUSWZrmkz 3660
+VPADDUSWZrr 3661
+VPADDUSWZrrk 3662
+VPADDUSWZrrkz 3663
+VPADDUSWrm 3664
+VPADDUSWrr 3665
+VPADDWYrm 3666
+VPADDWYrr 3667
+VPADDWZ 3668
+VPADDWZrm 3669
+VPADDWZrmk 3670
+VPADDWZrmkz 3671
+VPADDWZrr 3672
+VPADDWZrrk 3673
+VPADDWZrrkz 3674
+VPADDWrm 3675
+VPADDWrr 3676
+VPALIGNRYrmi 3677
+VPALIGNRYrri 3678
+VPALIGNRZ 3679
+VPALIGNRZrmi 3680
+VPALIGNRZrmik 3681
+VPALIGNRZrmikz 3682
+VPALIGNRZrri 3683
+VPALIGNRZrrik 3684
+VPALIGNRZrrikz 3685
+VPALIGNRrmi 3686
+VPALIGNRrri 3687
+VPANDDZ 3688
+VPANDDZrm 3689
+VPANDDZrmb 3690
+VPANDDZrmbk 3691
+VPANDDZrmbkz 3692
+VPANDDZrmk 3693
+VPANDDZrmkz 3694
+VPANDDZrr 3695
+VPANDDZrrk 3696
+VPANDDZrrkz 3697
+VPANDNDZ 3698
+VPANDNDZrm 3699
+VPANDNDZrmb 3700
+VPANDNDZrmbk 3701
+VPANDNDZrmbkz 3702
+VPANDNDZrmk 3703
+VPANDNDZrmkz 3704
+VPANDNDZrr 3705
+VPANDNDZrrk 3706
+VPANDNDZrrkz 3707
+VPANDNQZ 3708
+VPANDNQZrm 3709
+VPANDNQZrmb 3710
+VPANDNQZrmbk 3711
+VPANDNQZrmbkz 3712
+VPANDNQZrmk 3713
+VPANDNQZrmkz 3714
+VPANDNQZrr 3715
+VPANDNQZrrk 3716
+VPANDNQZrrkz 3717
+VPANDNYrm 3718
+VPANDNYrr 3719
+VPANDNrm 3720
+VPANDNrr 3721
+VPANDQZ 3722
+VPANDQZrm 3723
+VPANDQZrmb 3724
+VPANDQZrmbk 3725
+VPANDQZrmbkz 3726
+VPANDQZrmk 3727
+VPANDQZrmkz 3728
+VPANDQZrr 3729
+VPANDQZrrk 3730
+VPANDQZrrkz 3731
+VPANDYrm 3732
+VPANDYrr 3733
+VPANDrm 3734
+VPANDrr 3735
+VPAVGBYrm 3736
+VPAVGBYrr 3737
+VPAVGBZ 3738
+VPAVGBZrm 3739
+VPAVGBZrmk 3740
+VPAVGBZrmkz 3741
+VPAVGBZrr 3742
+VPAVGBZrrk 3743
+VPAVGBZrrkz 3744
+VPAVGBrm 3745
+VPAVGBrr 3746
+VPAVGWYrm 3747
+VPAVGWYrr 3748
+VPAVGWZ 3749
+VPAVGWZrm 3750
+VPAVGWZrmk 3751
+VPAVGWZrmkz 3752
+VPAVGWZrr 3753
+VPAVGWZrrk 3754
+VPAVGWZrrkz 3755
+VPAVGWrm 3756
+VPAVGWrr 3757
+VPBLENDDYrmi 3758
+VPBLENDDYrri 3759
+VPBLENDDrmi 3760
+VPBLENDDrri 3761
+VPBLENDMBZ 3762
+VPBLENDMBZrm 3763
+VPBLENDMBZrmk 3764
+VPBLENDMBZrmkz 3765
+VPBLENDMBZrr 3766
+VPBLENDMBZrrk 3767
+VPBLENDMBZrrkz 3768
+VPBLENDMDZ 3769
+VPBLENDMDZrm 3770
+VPBLENDMDZrmb 3771
+VPBLENDMDZrmbk 3772
+VPBLENDMDZrmbkz 3773
+VPBLENDMDZrmk 3774
+VPBLENDMDZrmkz 3775
+VPBLENDMDZrr 3776
+VPBLENDMDZrrk 3777
+VPBLENDMDZrrkz 3778
+VPBLENDMQZ 3779
+VPBLENDMQZrm 3780
+VPBLENDMQZrmb 3781
+VPBLENDMQZrmbk 3782
+VPBLENDMQZrmbkz 3783
+VPBLENDMQZrmk 3784
+VPBLENDMQZrmkz 3785
+VPBLENDMQZrr 3786
+VPBLENDMQZrrk 3787
+VPBLENDMQZrrkz 3788
+VPBLENDMWZ 3789
+VPBLENDMWZrm 3790
+VPBLENDMWZrmk 3791
+VPBLENDMWZrmkz 3792
+VPBLENDMWZrr 3793
+VPBLENDMWZrrk 3794
+VPBLENDMWZrrkz 3795
+VPBLENDVBYrmr 3796
+VPBLENDVBYrrr 3797
+VPBLENDVBrmr 3798
+VPBLENDVBrrr 3799
+VPBLENDWYrmi 3800
+VPBLENDWYrri 3801
+VPBLENDWrmi 3802
+VPBLENDWrri 3803
+VPBROADCASTBYrm 3804
+VPBROADCASTBYrr 3805
+VPBROADCASTBZ 3806
+VPBROADCASTBZrm 3807
+VPBROADCASTBZrmk 3808
+VPBROADCASTBZrmkz 3809
+VPBROADCASTBZrr 3810
+VPBROADCASTBZrrk 3811
+VPBROADCASTBZrrkz 3812
+VPBROADCASTBrZ 3813
+VPBROADCASTBrZrr 3814
+VPBROADCASTBrZrrk 3815
+VPBROADCASTBrZrrkz 3816
+VPBROADCASTBrm 3817
+VPBROADCASTBrr 3818
+VPBROADCASTDYrm 3819
+VPBROADCASTDYrr 3820
+VPBROADCASTDZ 3821
+VPBROADCASTDZrm 3822
+VPBROADCASTDZrmk 3823
+VPBROADCASTDZrmkz 3824
+VPBROADCASTDZrr 3825
+VPBROADCASTDZrrk 3826
+VPBROADCASTDZrrkz 3827
+VPBROADCASTDrZ 3828
+VPBROADCASTDrZrr 3829
+VPBROADCASTDrZrrk 3830
+VPBROADCASTDrZrrkz 3831
+VPBROADCASTDrm 3832
+VPBROADCASTDrr 3833
+VPBROADCASTMB 3834
+VPBROADCASTMW 3835
+VPBROADCASTQYrm 3836
+VPBROADCASTQYrr 3837
+VPBROADCASTQZ 3838
+VPBROADCASTQZrm 3839
+VPBROADCASTQZrmk 3840
+VPBROADCASTQZrmkz 3841
+VPBROADCASTQZrr 3842
+VPBROADCASTQZrrk 3843
+VPBROADCASTQZrrkz 3844
+VPBROADCASTQrZ 3845
+VPBROADCASTQrZrr 3846
+VPBROADCASTQrZrrk 3847
+VPBROADCASTQrZrrkz 3848
+VPBROADCASTQrm 3849
+VPBROADCASTQrr 3850
+VPBROADCASTWYrm 3851
+VPBROADCASTWYrr 3852
+VPBROADCASTWZ 3853
+VPBROADCASTWZrm 3854
+VPBROADCASTWZrmk 3855
+VPBROADCASTWZrmkz 3856
+VPBROADCASTWZrr 3857
+VPBROADCASTWZrrk 3858
+VPBROADCASTWZrrkz 3859
+VPBROADCASTWrZ 3860
+VPBROADCASTWrZrr 3861
+VPBROADCASTWrZrrk 3862
+VPBROADCASTWrZrrkz 3863
+VPBROADCASTWrm 3864
+VPBROADCASTWrr 3865
+VPCLMULQDQYrmi 3866
+VPCLMULQDQYrri 3867
+VPCLMULQDQZ 3868
+VPCLMULQDQZrmi 3869
+VPCLMULQDQZrri 3870
+VPCLMULQDQrmi 3871
+VPCLMULQDQrri 3872
+VPCMOVYrmr 3873
+VPCMOVYrrm 3874
+VPCMOVYrrr 3875
+VPCMOVYrrr_REV 3876
+VPCMOVrmr 3877
+VPCMOVrrm 3878
+VPCMOVrrr 3879
+VPCMOVrrr_REV 3880
+VPCMPBZ 3881
+VPCMPBZrmi 3882
+VPCMPBZrmik 3883
+VPCMPBZrri 3884
+VPCMPBZrrik 3885
+VPCMPDZ 3886
+VPCMPDZrmbi 3887
+VPCMPDZrmbik 3888
+VPCMPDZrmi 3889
+VPCMPDZrmik 3890
+VPCMPDZrri 3891
+VPCMPDZrrik 3892
+VPCMPEQBYrm 3893
+VPCMPEQBYrr 3894
+VPCMPEQBZ 3895
+VPCMPEQBZrm 3896
+VPCMPEQBZrmk 3897
+VPCMPEQBZrr 3898
+VPCMPEQBZrrk 3899
+VPCMPEQBrm 3900
+VPCMPEQBrr 3901
+VPCMPEQDYrm 3902
+VPCMPEQDYrr 3903
+VPCMPEQDZ 3904
+VPCMPEQDZrm 3905
+VPCMPEQDZrmb 3906
+VPCMPEQDZrmbk 3907
+VPCMPEQDZrmk 3908
+VPCMPEQDZrr 3909
+VPCMPEQDZrrk 3910
+VPCMPEQDrm 3911
+VPCMPEQDrr 3912
+VPCMPEQQYrm 3913
+VPCMPEQQYrr 3914
+VPCMPEQQZ 3915
+VPCMPEQQZrm 3916
+VPCMPEQQZrmb 3917
+VPCMPEQQZrmbk 3918
+VPCMPEQQZrmk 3919
+VPCMPEQQZrr 3920
+VPCMPEQQZrrk 3921
+VPCMPEQQrm 3922
+VPCMPEQQrr 3923
+VPCMPEQWYrm 3924
+VPCMPEQWYrr 3925
+VPCMPEQWZ 3926
+VPCMPEQWZrm 3927
+VPCMPEQWZrmk 3928
+VPCMPEQWZrr 3929
+VPCMPEQWZrrk 3930
+VPCMPEQWrm 3931
+VPCMPEQWrr 3932
+VPCMPESTRIrmi 3933
+VPCMPESTRIrri 3934
+VPCMPESTRMrmi 3935
+VPCMPESTRMrri 3936
+VPCMPGTBYrm 3937
+VPCMPGTBYrr 3938
+VPCMPGTBZ 3939
+VPCMPGTBZrm 3940
+VPCMPGTBZrmk 3941
+VPCMPGTBZrr 3942
+VPCMPGTBZrrk 3943
+VPCMPGTBrm 3944
+VPCMPGTBrr 3945
+VPCMPGTDYrm 3946
+VPCMPGTDYrr 3947
+VPCMPGTDZ 3948
+VPCMPGTDZrm 3949
+VPCMPGTDZrmb 3950
+VPCMPGTDZrmbk 3951
+VPCMPGTDZrmk 3952
+VPCMPGTDZrr 3953
+VPCMPGTDZrrk 3954
+VPCMPGTDrm 3955
+VPCMPGTDrr 3956
+VPCMPGTQYrm 3957
+VPCMPGTQYrr 3958
+VPCMPGTQZ 3959
+VPCMPGTQZrm 3960
+VPCMPGTQZrmb 3961
+VPCMPGTQZrmbk 3962
+VPCMPGTQZrmk 3963
+VPCMPGTQZrr 3964
+VPCMPGTQZrrk 3965
+VPCMPGTQrm 3966
+VPCMPGTQrr 3967
+VPCMPGTWYrm 3968
+VPCMPGTWYrr 3969
+VPCMPGTWZ 3970
+VPCMPGTWZrm 3971
+VPCMPGTWZrmk 3972
+VPCMPGTWZrr 3973
+VPCMPGTWZrrk 3974
+VPCMPGTWrm 3975
+VPCMPGTWrr 3976
+VPCMPISTRIrmi 3977
+VPCMPISTRIrri 3978
+VPCMPISTRMrmi 3979
+VPCMPISTRMrri 3980
+VPCMPQZ 3981
+VPCMPQZrmbi 3982
+VPCMPQZrmbik 3983
+VPCMPQZrmi 3984
+VPCMPQZrmik 3985
+VPCMPQZrri 3986
+VPCMPQZrrik 3987
+VPCMPUBZ 3988
+VPCMPUBZrmi 3989
+VPCMPUBZrmik 3990
+VPCMPUBZrri 3991
+VPCMPUBZrrik 3992
+VPCMPUDZ 3993
+VPCMPUDZrmbi 3994
+VPCMPUDZrmbik 3995
+VPCMPUDZrmi 3996
+VPCMPUDZrmik 3997
+VPCMPUDZrri 3998
+VPCMPUDZrrik 3999
+VPCMPUQZ 4000
+VPCMPUQZrmbi 4001
+VPCMPUQZrmbik 4002
+VPCMPUQZrmi 4003
+VPCMPUQZrmik 4004
+VPCMPUQZrri 4005
+VPCMPUQZrrik 4006
+VPCMPUWZ 4007
+VPCMPUWZrmi 4008
+VPCMPUWZrmik 4009
+VPCMPUWZrri 4010
+VPCMPUWZrrik 4011
+VPCMPWZ 4012
+VPCMPWZrmi 4013
+VPCMPWZrmik 4014
+VPCMPWZrri 4015
+VPCMPWZrrik 4016
+VPCOMBmi 4017
+VPCOMBri 4018
+VPCOMDmi 4019
+VPCOMDri 4020
+VPCOMPRESSBZ 4021
+VPCOMPRESSBZmr 4022
+VPCOMPRESSBZmrk 4023
+VPCOMPRESSBZrr 4024
+VPCOMPRESSBZrrk 4025
+VPCOMPRESSBZrrkz 4026
+VPCOMPRESSDZ 4027
+VPCOMPRESSDZmr 4028
+VPCOMPRESSDZmrk 4029
+VPCOMPRESSDZrr 4030
+VPCOMPRESSDZrrk 4031
+VPCOMPRESSDZrrkz 4032
+VPCOMPRESSQZ 4033
+VPCOMPRESSQZmr 4034
+VPCOMPRESSQZmrk 4035
+VPCOMPRESSQZrr 4036
+VPCOMPRESSQZrrk 4037
+VPCOMPRESSQZrrkz 4038
+VPCOMPRESSWZ 4039
+VPCOMPRESSWZmr 4040
+VPCOMPRESSWZmrk 4041
+VPCOMPRESSWZrr 4042
+VPCOMPRESSWZrrk 4043
+VPCOMPRESSWZrrkz 4044
+VPCOMQmi 4045
+VPCOMQri 4046
+VPCOMUBmi 4047
+VPCOMUBri 4048
+VPCOMUDmi 4049
+VPCOMUDri 4050
+VPCOMUQmi 4051
+VPCOMUQri 4052
+VPCOMUWmi 4053
+VPCOMUWri 4054
+VPCOMWmi 4055
+VPCOMWri 4056
+VPCONFLICTDZ 4057
+VPCONFLICTDZrm 4058
+VPCONFLICTDZrmb 4059
+VPCONFLICTDZrmbk 4060
+VPCONFLICTDZrmbkz 4061
+VPCONFLICTDZrmk 4062
+VPCONFLICTDZrmkz 4063
+VPCONFLICTDZrr 4064
+VPCONFLICTDZrrk 4065
+VPCONFLICTDZrrkz 4066
+VPCONFLICTQZ 4067
+VPCONFLICTQZrm 4068
+VPCONFLICTQZrmb 4069
+VPCONFLICTQZrmbk 4070
+VPCONFLICTQZrmbkz 4071
+VPCONFLICTQZrmk 4072
+VPCONFLICTQZrmkz 4073
+VPCONFLICTQZrr 4074
+VPCONFLICTQZrrk 4075
+VPCONFLICTQZrrkz 4076
+VPDPBSSDSYrm 4077
+VPDPBSSDSYrr 4078
+VPDPBSSDSZ 4079
+VPDPBSSDSZrm 4080
+VPDPBSSDSZrmb 4081
+VPDPBSSDSZrmbk 4082
+VPDPBSSDSZrmbkz 4083
+VPDPBSSDSZrmk 4084
+VPDPBSSDSZrmkz 4085
+VPDPBSSDSZrr 4086
+VPDPBSSDSZrrk 4087
+VPDPBSSDSZrrkz 4088
+VPDPBSSDSrm 4089
+VPDPBSSDSrr 4090
+VPDPBSSDYrm 4091
+VPDPBSSDYrr 4092
+VPDPBSSDZ 4093
+VPDPBSSDZrm 4094
+VPDPBSSDZrmb 4095
+VPDPBSSDZrmbk 4096
+VPDPBSSDZrmbkz 4097
+VPDPBSSDZrmk 4098
+VPDPBSSDZrmkz 4099
+VPDPBSSDZrr 4100
+VPDPBSSDZrrk 4101
+VPDPBSSDZrrkz 4102
+VPDPBSSDrm 4103
+VPDPBSSDrr 4104
+VPDPBSUDSYrm 4105
+VPDPBSUDSYrr 4106
+VPDPBSUDSZ 4107
+VPDPBSUDSZrm 4108
+VPDPBSUDSZrmb 4109
+VPDPBSUDSZrmbk 4110
+VPDPBSUDSZrmbkz 4111
+VPDPBSUDSZrmk 4112
+VPDPBSUDSZrmkz 4113
+VPDPBSUDSZrr 4114
+VPDPBSUDSZrrk 4115
+VPDPBSUDSZrrkz 4116
+VPDPBSUDSrm 4117
+VPDPBSUDSrr 4118
+VPDPBSUDYrm 4119
+VPDPBSUDYrr 4120
+VPDPBSUDZ 4121
+VPDPBSUDZrm 4122
+VPDPBSUDZrmb 4123
+VPDPBSUDZrmbk 4124
+VPDPBSUDZrmbkz 4125
+VPDPBSUDZrmk 4126
+VPDPBSUDZrmkz 4127
+VPDPBSUDZrr 4128
+VPDPBSUDZrrk 4129
+VPDPBSUDZrrkz 4130
+VPDPBSUDrm 4131
+VPDPBSUDrr 4132
+VPDPBUSDSYrm 4133
+VPDPBUSDSYrr 4134
+VPDPBUSDSZ 4135
+VPDPBUSDSZrm 4136
+VPDPBUSDSZrmb 4137
+VPDPBUSDSZrmbk 4138
+VPDPBUSDSZrmbkz 4139
+VPDPBUSDSZrmk 4140
+VPDPBUSDSZrmkz 4141
+VPDPBUSDSZrr 4142
+VPDPBUSDSZrrk 4143
+VPDPBUSDSZrrkz 4144
+VPDPBUSDSrm 4145
+VPDPBUSDSrr 4146
+VPDPBUSDYrm 4147
+VPDPBUSDYrr 4148
+VPDPBUSDZ 4149
+VPDPBUSDZrm 4150
+VPDPBUSDZrmb 4151
+VPDPBUSDZrmbk 4152
+VPDPBUSDZrmbkz 4153
+VPDPBUSDZrmk 4154
+VPDPBUSDZrmkz 4155
+VPDPBUSDZrr 4156
+VPDPBUSDZrrk 4157
+VPDPBUSDZrrkz 4158
+VPDPBUSDrm 4159
+VPDPBUSDrr 4160
+VPDPBUUDSYrm 4161
+VPDPBUUDSYrr 4162
+VPDPBUUDSZ 4163
+VPDPBUUDSZrm 4164
+VPDPBUUDSZrmb 4165
+VPDPBUUDSZrmbk 4166
+VPDPBUUDSZrmbkz 4167
+VPDPBUUDSZrmk 4168
+VPDPBUUDSZrmkz 4169
+VPDPBUUDSZrr 4170
+VPDPBUUDSZrrk 4171
+VPDPBUUDSZrrkz 4172
+VPDPBUUDSrm 4173
+VPDPBUUDSrr 4174
+VPDPBUUDYrm 4175
+VPDPBUUDYrr 4176
+VPDPBUUDZ 4177
+VPDPBUUDZrm 4178
+VPDPBUUDZrmb 4179
+VPDPBUUDZrmbk 4180
+VPDPBUUDZrmbkz 4181
+VPDPBUUDZrmk 4182
+VPDPBUUDZrmkz 4183
+VPDPBUUDZrr 4184
+VPDPBUUDZrrk 4185
+VPDPBUUDZrrkz 4186
+VPDPBUUDrm 4187
+VPDPBUUDrr 4188
+VPDPWSSDSYrm 4189
+VPDPWSSDSYrr 4190
+VPDPWSSDSZ 4191
+VPDPWSSDSZrm 4192
+VPDPWSSDSZrmb 4193
+VPDPWSSDSZrmbk 4194
+VPDPWSSDSZrmbkz 4195
+VPDPWSSDSZrmk 4196
+VPDPWSSDSZrmkz 4197
+VPDPWSSDSZrr 4198
+VPDPWSSDSZrrk 4199
+VPDPWSSDSZrrkz 4200
+VPDPWSSDSrm 4201
+VPDPWSSDSrr 4202
+VPDPWSSDYrm 4203
+VPDPWSSDYrr 4204
+VPDPWSSDZ 4205
+VPDPWSSDZrm 4206
+VPDPWSSDZrmb 4207
+VPDPWSSDZrmbk 4208
+VPDPWSSDZrmbkz 4209
+VPDPWSSDZrmk 4210
+VPDPWSSDZrmkz 4211
+VPDPWSSDZrr 4212
+VPDPWSSDZrrk 4213
+VPDPWSSDZrrkz 4214
+VPDPWSSDrm 4215
+VPDPWSSDrr 4216
+VPDPWSUDSYrm 4217
+VPDPWSUDSYrr 4218
+VPDPWSUDSZ 4219
+VPDPWSUDSZrm 4220
+VPDPWSUDSZrmb 4221
+VPDPWSUDSZrmbk 4222
+VPDPWSUDSZrmbkz 4223
+VPDPWSUDSZrmk 4224
+VPDPWSUDSZrmkz 4225
+VPDPWSUDSZrr 4226
+VPDPWSUDSZrrk 4227
+VPDPWSUDSZrrkz 4228
+VPDPWSUDSrm 4229
+VPDPWSUDSrr 4230
+VPDPWSUDYrm 4231
+VPDPWSUDYrr 4232
+VPDPWSUDZ 4233
+VPDPWSUDZrm 4234
+VPDPWSUDZrmb 4235
+VPDPWSUDZrmbk 4236
+VPDPWSUDZrmbkz 4237
+VPDPWSUDZrmk 4238
+VPDPWSUDZrmkz 4239
+VPDPWSUDZrr 4240
+VPDPWSUDZrrk 4241
+VPDPWSUDZrrkz 4242
+VPDPWSUDrm 4243
+VPDPWSUDrr 4244
+VPDPWUSDSYrm 4245
+VPDPWUSDSYrr 4246
+VPDPWUSDSZ 4247
+VPDPWUSDSZrm 4248
+VPDPWUSDSZrmb 4249
+VPDPWUSDSZrmbk 4250
+VPDPWUSDSZrmbkz 4251
+VPDPWUSDSZrmk 4252
+VPDPWUSDSZrmkz 4253
+VPDPWUSDSZrr 4254
+VPDPWUSDSZrrk 4255
+VPDPWUSDSZrrkz 4256
+VPDPWUSDSrm 4257
+VPDPWUSDSrr 4258
+VPDPWUSDYrm 4259
+VPDPWUSDYrr 4260
+VPDPWUSDZ 4261
+VPDPWUSDZrm 4262
+VPDPWUSDZrmb 4263
+VPDPWUSDZrmbk 4264
+VPDPWUSDZrmbkz 4265
+VPDPWUSDZrmk 4266
+VPDPWUSDZrmkz 4267
+VPDPWUSDZrr 4268
+VPDPWUSDZrrk 4269
+VPDPWUSDZrrkz 4270
+VPDPWUSDrm 4271
+VPDPWUSDrr 4272
+VPDPWUUDSYrm 4273
+VPDPWUUDSYrr 4274
+VPDPWUUDSZ 4275
+VPDPWUUDSZrm 4276
+VPDPWUUDSZrmb 4277
+VPDPWUUDSZrmbk 4278
+VPDPWUUDSZrmbkz 4279
+VPDPWUUDSZrmk 4280
+VPDPWUUDSZrmkz 4281
+VPDPWUUDSZrr 4282
+VPDPWUUDSZrrk 4283
+VPDPWUUDSZrrkz 4284
+VPDPWUUDSrm 4285
+VPDPWUUDSrr 4286
+VPDPWUUDYrm 4287
+VPDPWUUDYrr 4288
+VPDPWUUDZ 4289
+VPDPWUUDZrm 4290
+VPDPWUUDZrmb 4291
+VPDPWUUDZrmbk 4292
+VPDPWUUDZrmbkz 4293
+VPDPWUUDZrmk 4294
+VPDPWUUDZrmkz 4295
+VPDPWUUDZrr 4296
+VPDPWUUDZrrk 4297
+VPDPWUUDZrrkz 4298
+VPDPWUUDrm 4299
+VPDPWUUDrr 4300
+VPERM 4301
+VPERMBZ 4302
+VPERMBZrm 4303
+VPERMBZrmk 4304
+VPERMBZrmkz 4305
+VPERMBZrr 4306
+VPERMBZrrk 4307
+VPERMBZrrkz 4308
+VPERMDYrm 4309
+VPERMDYrr 4310
+VPERMDZ 4311
+VPERMDZrm 4312
+VPERMDZrmb 4313
+VPERMDZrmbk 4314
+VPERMDZrmbkz 4315
+VPERMDZrmk 4316
+VPERMDZrmkz 4317
+VPERMDZrr 4318
+VPERMDZrrk 4319
+VPERMDZrrkz 4320
+VPERMI 4321
+VPERMIL 4322
+VPERMILPDYmi 4323
+VPERMILPDYri 4324
+VPERMILPDYrm 4325
+VPERMILPDYrr 4326
+VPERMILPDZ 4327
+VPERMILPDZmbi 4328
+VPERMILPDZmbik 4329
+VPERMILPDZmbikz 4330
+VPERMILPDZmi 4331
+VPERMILPDZmik 4332
+VPERMILPDZmikz 4333
+VPERMILPDZri 4334
+VPERMILPDZrik 4335
+VPERMILPDZrikz 4336
+VPERMILPDZrm 4337
+VPERMILPDZrmb 4338
+VPERMILPDZrmbk 4339
+VPERMILPDZrmbkz 4340
+VPERMILPDZrmk 4341
+VPERMILPDZrmkz 4342
+VPERMILPDZrr 4343
+VPERMILPDZrrk 4344
+VPERMILPDZrrkz 4345
+VPERMILPDmi 4346
+VPERMILPDri 4347
+VPERMILPDrm 4348
+VPERMILPDrr 4349
+VPERMILPSYmi 4350
+VPERMILPSYri 4351
+VPERMILPSYrm 4352
+VPERMILPSYrr 4353
+VPERMILPSZ 4354
+VPERMILPSZmbi 4355
+VPERMILPSZmbik 4356
+VPERMILPSZmbikz 4357
+VPERMILPSZmi 4358
+VPERMILPSZmik 4359
+VPERMILPSZmikz 4360
+VPERMILPSZri 4361
+VPERMILPSZrik 4362
+VPERMILPSZrikz 4363
+VPERMILPSZrm 4364
+VPERMILPSZrmb 4365
+VPERMILPSZrmbk 4366
+VPERMILPSZrmbkz 4367
+VPERMILPSZrmk 4368
+VPERMILPSZrmkz 4369
+VPERMILPSZrr 4370
+VPERMILPSZrrk 4371
+VPERMILPSZrrkz 4372
+VPERMILPSmi 4373
+VPERMILPSri 4374
+VPERMILPSrm 4375
+VPERMILPSrr 4376
+VPERMPDYmi 4377
+VPERMPDYri 4378
+VPERMPDZ 4379
+VPERMPDZmbi 4380
+VPERMPDZmbik 4381
+VPERMPDZmbikz 4382
+VPERMPDZmi 4383
+VPERMPDZmik 4384
+VPERMPDZmikz 4385
+VPERMPDZri 4386
+VPERMPDZrik 4387
+VPERMPDZrikz 4388
+VPERMPDZrm 4389
+VPERMPDZrmb 4390
+VPERMPDZrmbk 4391
+VPERMPDZrmbkz 4392
+VPERMPDZrmk 4393
+VPERMPDZrmkz 4394
+VPERMPDZrr 4395
+VPERMPDZrrk 4396
+VPERMPDZrrkz 4397
+VPERMPSYrm 4398
+VPERMPSYrr 4399
+VPERMPSZ 4400
+VPERMPSZrm 4401
+VPERMPSZrmb 4402
+VPERMPSZrmbk 4403
+VPERMPSZrmbkz 4404
+VPERMPSZrmk 4405
+VPERMPSZrmkz 4406
+VPERMPSZrr 4407
+VPERMPSZrrk 4408
+VPERMPSZrrkz 4409
+VPERMQYmi 4410
+VPERMQYri 4411
+VPERMQZ 4412
+VPERMQZmbi 4413
+VPERMQZmbik 4414
+VPERMQZmbikz 4415
+VPERMQZmi 4416
+VPERMQZmik 4417
+VPERMQZmikz 4418
+VPERMQZri 4419
+VPERMQZrik 4420
+VPERMQZrikz 4421
+VPERMQZrm 4422
+VPERMQZrmb 4423
+VPERMQZrmbk 4424
+VPERMQZrmbkz 4425
+VPERMQZrmk 4426
+VPERMQZrmkz 4427
+VPERMQZrr 4428
+VPERMQZrrk 4429
+VPERMQZrrkz 4430
+VPERMT 4431
+VPERMWZ 4432
+VPERMWZrm 4433
+VPERMWZrmk 4434
+VPERMWZrmkz 4435
+VPERMWZrr 4436
+VPERMWZrrk 4437
+VPERMWZrrkz 4438
+VPEXPANDBZ 4439
+VPEXPANDBZrm 4440
+VPEXPANDBZrmk 4441
+VPEXPANDBZrmkz 4442
+VPEXPANDBZrr 4443
+VPEXPANDBZrrk 4444
+VPEXPANDBZrrkz 4445
+VPEXPANDDZ 4446
+VPEXPANDDZrm 4447
+VPEXPANDDZrmk 4448
+VPEXPANDDZrmkz 4449
+VPEXPANDDZrr 4450
+VPEXPANDDZrrk 4451
+VPEXPANDDZrrkz 4452
+VPEXPANDQZ 4453
+VPEXPANDQZrm 4454
+VPEXPANDQZrmk 4455
+VPEXPANDQZrmkz 4456
+VPEXPANDQZrr 4457
+VPEXPANDQZrrk 4458
+VPEXPANDQZrrkz 4459
+VPEXPANDWZ 4460
+VPEXPANDWZrm 4461
+VPEXPANDWZrmk 4462
+VPEXPANDWZrmkz 4463
+VPEXPANDWZrr 4464
+VPEXPANDWZrrk 4465
+VPEXPANDWZrrkz 4466
+VPEXTRBZmri 4467
+VPEXTRBZrri 4468
+VPEXTRBmri 4469
+VPEXTRBrri 4470
+VPEXTRDZmri 4471
+VPEXTRDZrri 4472
+VPEXTRDmri 4473
+VPEXTRDrri 4474
+VPEXTRQZmri 4475
+VPEXTRQZrri 4476
+VPEXTRQmri 4477
+VPEXTRQrri 4478
+VPEXTRWZmri 4479
+VPEXTRWZrri 4480
+VPEXTRWZrri_REV 4481
+VPEXTRWmri 4482
+VPEXTRWrri 4483
+VPEXTRWrri_REV 4484
+VPGATHERDDYrm 4485
+VPGATHERDDZ 4486
+VPGATHERDDZrm 4487
+VPGATHERDDrm 4488
+VPGATHERDQYrm 4489
+VPGATHERDQZ 4490
+VPGATHERDQZrm 4491
+VPGATHERDQrm 4492
+VPGATHERQDYrm 4493
+VPGATHERQDZ 4494
+VPGATHERQDZrm 4495
+VPGATHERQDrm 4496
+VPGATHERQQYrm 4497
+VPGATHERQQZ 4498
+VPGATHERQQZrm 4499
+VPGATHERQQrm 4500
+VPHADDBDrm 4501
+VPHADDBDrr 4502
+VPHADDBQrm 4503
+VPHADDBQrr 4504
+VPHADDBWrm 4505
+VPHADDBWrr 4506
+VPHADDDQrm 4507
+VPHADDDQrr 4508
+VPHADDDYrm 4509
+VPHADDDYrr 4510
+VPHADDDrm 4511
+VPHADDDrr 4512
+VPHADDSWYrm 4513
+VPHADDSWYrr 4514
+VPHADDSWrm 4515
+VPHADDSWrr 4516
+VPHADDUBDrm 4517
+VPHADDUBDrr 4518
+VPHADDUBQrm 4519
+VPHADDUBQrr 4520
+VPHADDUBWrm 4521
+VPHADDUBWrr 4522
+VPHADDUDQrm 4523
+VPHADDUDQrr 4524
+VPHADDUWDrm 4525
+VPHADDUWDrr 4526
+VPHADDUWQrm 4527
+VPHADDUWQrr 4528
+VPHADDWDrm 4529
+VPHADDWDrr 4530
+VPHADDWQrm 4531
+VPHADDWQrr 4532
+VPHADDWYrm 4533
+VPHADDWYrr 4534
+VPHADDWrm 4535
+VPHADDWrr 4536
+VPHMINPOSUWrm 4537
+VPHMINPOSUWrr 4538
+VPHSUBBWrm 4539
+VPHSUBBWrr 4540
+VPHSUBDQrm 4541
+VPHSUBDQrr 4542
+VPHSUBDYrm 4543
+VPHSUBDYrr 4544
+VPHSUBDrm 4545
+VPHSUBDrr 4546
+VPHSUBSWYrm 4547
+VPHSUBSWYrr 4548
+VPHSUBSWrm 4549
+VPHSUBSWrr 4550
+VPHSUBWDrm 4551
+VPHSUBWDrr 4552
+VPHSUBWYrm 4553
+VPHSUBWYrr 4554
+VPHSUBWrm 4555
+VPHSUBWrr 4556
+VPINSRBZrmi 4557
+VPINSRBZrri 4558
+VPINSRBrmi 4559
+VPINSRBrri 4560
+VPINSRDZrmi 4561
+VPINSRDZrri 4562
+VPINSRDrmi 4563
+VPINSRDrri 4564
+VPINSRQZrmi 4565
+VPINSRQZrri 4566
+VPINSRQrmi 4567
+VPINSRQrri 4568
+VPINSRWZrmi 4569
+VPINSRWZrri 4570
+VPINSRWrmi 4571
+VPINSRWrri 4572
+VPLZCNTDZ 4573
+VPLZCNTDZrm 4574
+VPLZCNTDZrmb 4575
+VPLZCNTDZrmbk 4576
+VPLZCNTDZrmbkz 4577
+VPLZCNTDZrmk 4578
+VPLZCNTDZrmkz 4579
+VPLZCNTDZrr 4580
+VPLZCNTDZrrk 4581
+VPLZCNTDZrrkz 4582
+VPLZCNTQZ 4583
+VPLZCNTQZrm 4584
+VPLZCNTQZrmb 4585
+VPLZCNTQZrmbk 4586
+VPLZCNTQZrmbkz 4587
+VPLZCNTQZrmk 4588
+VPLZCNTQZrmkz 4589
+VPLZCNTQZrr 4590
+VPLZCNTQZrrk 4591
+VPLZCNTQZrrkz 4592
+VPMACSDDrm 4593
+VPMACSDDrr 4594
+VPMACSDQHrm 4595
+VPMACSDQHrr 4596
+VPMACSDQLrm 4597
+VPMACSDQLrr 4598
+VPMACSSDDrm 4599
+VPMACSSDDrr 4600
+VPMACSSDQHrm 4601
+VPMACSSDQHrr 4602
+VPMACSSDQLrm 4603
+VPMACSSDQLrr 4604
+VPMACSSWDrm 4605
+VPMACSSWDrr 4606
+VPMACSSWWrm 4607
+VPMACSSWWrr 4608
+VPMACSWDrm 4609
+VPMACSWDrr 4610
+VPMACSWWrm 4611
+VPMACSWWrr 4612
+VPMADCSSWDrm 4613
+VPMADCSSWDrr 4614
+VPMADCSWDrm 4615
+VPMADCSWDrr 4616
+VPMADD 4617
+VPMADDUBSWYrm 4618
+VPMADDUBSWYrr 4619
+VPMADDUBSWZ 4620
+VPMADDUBSWZrm 4621
+VPMADDUBSWZrmk 4622
+VPMADDUBSWZrmkz 4623
+VPMADDUBSWZrr 4624
+VPMADDUBSWZrrk 4625
+VPMADDUBSWZrrkz 4626
+VPMADDUBSWrm 4627
+VPMADDUBSWrr 4628
+VPMADDWDYrm 4629
+VPMADDWDYrr 4630
+VPMADDWDZ 4631
+VPMADDWDZrm 4632
+VPMADDWDZrmk 4633
+VPMADDWDZrmkz 4634
+VPMADDWDZrr 4635
+VPMADDWDZrrk 4636
+VPMADDWDZrrkz 4637
+VPMADDWDrm 4638
+VPMADDWDrr 4639
+VPMASKMOVDYmr 4640
+VPMASKMOVDYrm 4641
+VPMASKMOVDmr 4642
+VPMASKMOVDrm 4643
+VPMASKMOVQYmr 4644
+VPMASKMOVQYrm 4645
+VPMASKMOVQmr 4646
+VPMASKMOVQrm 4647
+VPMAXSBYrm 4648
+VPMAXSBYrr 4649
+VPMAXSBZ 4650
+VPMAXSBZrm 4651
+VPMAXSBZrmk 4652
+VPMAXSBZrmkz 4653
+VPMAXSBZrr 4654
+VPMAXSBZrrk 4655
+VPMAXSBZrrkz 4656
+VPMAXSBrm 4657
+VPMAXSBrr 4658
+VPMAXSDYrm 4659
+VPMAXSDYrr 4660
+VPMAXSDZ 4661
+VPMAXSDZrm 4662
+VPMAXSDZrmb 4663
+VPMAXSDZrmbk 4664
+VPMAXSDZrmbkz 4665
+VPMAXSDZrmk 4666
+VPMAXSDZrmkz 4667
+VPMAXSDZrr 4668
+VPMAXSDZrrk 4669
+VPMAXSDZrrkz 4670
+VPMAXSDrm 4671
+VPMAXSDrr 4672
+VPMAXSQZ 4673
+VPMAXSQZrm 4674
+VPMAXSQZrmb 4675
+VPMAXSQZrmbk 4676
+VPMAXSQZrmbkz 4677
+VPMAXSQZrmk 4678
+VPMAXSQZrmkz 4679
+VPMAXSQZrr 4680
+VPMAXSQZrrk 4681
+VPMAXSQZrrkz 4682
+VPMAXSWYrm 4683
+VPMAXSWYrr 4684
+VPMAXSWZ 4685
+VPMAXSWZrm 4686
+VPMAXSWZrmk 4687
+VPMAXSWZrmkz 4688
+VPMAXSWZrr 4689
+VPMAXSWZrrk 4690
+VPMAXSWZrrkz 4691
+VPMAXSWrm 4692
+VPMAXSWrr 4693
+VPMAXUBYrm 4694
+VPMAXUBYrr 4695
+VPMAXUBZ 4696
+VPMAXUBZrm 4697
+VPMAXUBZrmk 4698
+VPMAXUBZrmkz 4699
+VPMAXUBZrr 4700
+VPMAXUBZrrk 4701
+VPMAXUBZrrkz 4702
+VPMAXUBrm 4703
+VPMAXUBrr 4704
+VPMAXUDYrm 4705
+VPMAXUDYrr 4706
+VPMAXUDZ 4707
+VPMAXUDZrm 4708
+VPMAXUDZrmb 4709
+VPMAXUDZrmbk 4710
+VPMAXUDZrmbkz 4711
+VPMAXUDZrmk 4712
+VPMAXUDZrmkz 4713
+VPMAXUDZrr 4714
+VPMAXUDZrrk 4715
+VPMAXUDZrrkz 4716
+VPMAXUDrm 4717
+VPMAXUDrr 4718
+VPMAXUQZ 4719
+VPMAXUQZrm 4720
+VPMAXUQZrmb 4721
+VPMAXUQZrmbk 4722
+VPMAXUQZrmbkz 4723
+VPMAXUQZrmk 4724
+VPMAXUQZrmkz 4725
+VPMAXUQZrr 4726
+VPMAXUQZrrk 4727
+VPMAXUQZrrkz 4728
+VPMAXUWYrm 4729
+VPMAXUWYrr 4730
+VPMAXUWZ 4731
+VPMAXUWZrm 4732
+VPMAXUWZrmk 4733
+VPMAXUWZrmkz 4734
+VPMAXUWZrr 4735
+VPMAXUWZrrk 4736
+VPMAXUWZrrkz 4737
+VPMAXUWrm 4738
+VPMAXUWrr 4739
+VPMINSBYrm 4740
+VPMINSBYrr 4741
+VPMINSBZ 4742
+VPMINSBZrm 4743
+VPMINSBZrmk 4744
+VPMINSBZrmkz 4745
+VPMINSBZrr 4746
+VPMINSBZrrk 4747
+VPMINSBZrrkz 4748
+VPMINSBrm 4749
+VPMINSBrr 4750
+VPMINSDYrm 4751
+VPMINSDYrr 4752
+VPMINSDZ 4753
+VPMINSDZrm 4754
+VPMINSDZrmb 4755
+VPMINSDZrmbk 4756
+VPMINSDZrmbkz 4757
+VPMINSDZrmk 4758
+VPMINSDZrmkz 4759
+VPMINSDZrr 4760
+VPMINSDZrrk 4761
+VPMINSDZrrkz 4762
+VPMINSDrm 4763
+VPMINSDrr 4764
+VPMINSQZ 4765
+VPMINSQZrm 4766
+VPMINSQZrmb 4767
+VPMINSQZrmbk 4768
+VPMINSQZrmbkz 4769
+VPMINSQZrmk 4770
+VPMINSQZrmkz 4771
+VPMINSQZrr 4772
+VPMINSQZrrk 4773
+VPMINSQZrrkz 4774
+VPMINSWYrm 4775
+VPMINSWYrr 4776
+VPMINSWZ 4777
+VPMINSWZrm 4778
+VPMINSWZrmk 4779
+VPMINSWZrmkz 4780
+VPMINSWZrr 4781
+VPMINSWZrrk 4782
+VPMINSWZrrkz 4783
+VPMINSWrm 4784
+VPMINSWrr 4785
+VPMINUBYrm 4786
+VPMINUBYrr 4787
+VPMINUBZ 4788
+VPMINUBZrm 4789
+VPMINUBZrmk 4790
+VPMINUBZrmkz 4791
+VPMINUBZrr 4792
+VPMINUBZrrk 4793
+VPMINUBZrrkz 4794
+VPMINUBrm 4795
+VPMINUBrr 4796
+VPMINUDYrm 4797
+VPMINUDYrr 4798
+VPMINUDZ 4799
+VPMINUDZrm 4800
+VPMINUDZrmb 4801
+VPMINUDZrmbk 4802
+VPMINUDZrmbkz 4803
+VPMINUDZrmk 4804
+VPMINUDZrmkz 4805
+VPMINUDZrr 4806
+VPMINUDZrrk 4807
+VPMINUDZrrkz 4808
+VPMINUDrm 4809
+VPMINUDrr 4810
+VPMINUQZ 4811
+VPMINUQZrm 4812
+VPMINUQZrmb 4813
+VPMINUQZrmbk 4814
+VPMINUQZrmbkz 4815
+VPMINUQZrmk 4816
+VPMINUQZrmkz 4817
+VPMINUQZrr 4818
+VPMINUQZrrk 4819
+VPMINUQZrrkz 4820
+VPMINUWYrm 4821
+VPMINUWYrr 4822
+VPMINUWZ 4823
+VPMINUWZrm 4824
+VPMINUWZrmk 4825
+VPMINUWZrmkz 4826
+VPMINUWZrr 4827
+VPMINUWZrrk 4828
+VPMINUWZrrkz 4829
+VPMINUWrm 4830
+VPMINUWrr 4831
+VPMOVB 4832
+VPMOVD 4833
+VPMOVDBZ 4834
+VPMOVDBZmr 4835
+VPMOVDBZmrk 4836
+VPMOVDBZrr 4837
+VPMOVDBZrrk 4838
+VPMOVDBZrrkz 4839
+VPMOVDWZ 4840
+VPMOVDWZmr 4841
+VPMOVDWZmrk 4842
+VPMOVDWZrr 4843
+VPMOVDWZrrk 4844
+VPMOVDWZrrkz 4845
+VPMOVM 4846
+VPMOVMSKBYrr 4847
+VPMOVMSKBrr 4848
+VPMOVQ 4849
+VPMOVQBZ 4850
+VPMOVQBZmr 4851
+VPMOVQBZmrk 4852
+VPMOVQBZrr 4853
+VPMOVQBZrrk 4854
+VPMOVQBZrrkz 4855
+VPMOVQDZ 4856
+VPMOVQDZmr 4857
+VPMOVQDZmrk 4858
+VPMOVQDZrr 4859
+VPMOVQDZrrk 4860
+VPMOVQDZrrkz 4861
+VPMOVQWZ 4862
+VPMOVQWZmr 4863
+VPMOVQWZmrk 4864
+VPMOVQWZrr 4865
+VPMOVQWZrrk 4866
+VPMOVQWZrrkz 4867
+VPMOVSDBZ 4868
+VPMOVSDBZmr 4869
+VPMOVSDBZmrk 4870
+VPMOVSDBZrr 4871
+VPMOVSDBZrrk 4872
+VPMOVSDBZrrkz 4873
+VPMOVSDWZ 4874
+VPMOVSDWZmr 4875
+VPMOVSDWZmrk 4876
+VPMOVSDWZrr 4877
+VPMOVSDWZrrk 4878
+VPMOVSDWZrrkz 4879
+VPMOVSQBZ 4880
+VPMOVSQBZmr 4881
+VPMOVSQBZmrk 4882
+VPMOVSQBZrr 4883
+VPMOVSQBZrrk 4884
+VPMOVSQBZrrkz 4885
+VPMOVSQDZ 4886
+VPMOVSQDZmr 4887
+VPMOVSQDZmrk 4888
+VPMOVSQDZrr 4889
+VPMOVSQDZrrk 4890
+VPMOVSQDZrrkz 4891
+VPMOVSQWZ 4892
+VPMOVSQWZmr 4893
+VPMOVSQWZmrk 4894
+VPMOVSQWZrr 4895
+VPMOVSQWZrrk 4896
+VPMOVSQWZrrkz 4897
+VPMOVSWBZ 4898
+VPMOVSWBZmr 4899
+VPMOVSWBZmrk 4900
+VPMOVSWBZrr 4901
+VPMOVSWBZrrk 4902
+VPMOVSWBZrrkz 4903
+VPMOVSXBDYrm 4904
+VPMOVSXBDYrr 4905
+VPMOVSXBDZ 4906
+VPMOVSXBDZrm 4907
+VPMOVSXBDZrmk 4908
+VPMOVSXBDZrmkz 4909
+VPMOVSXBDZrr 4910
+VPMOVSXBDZrrk 4911
+VPMOVSXBDZrrkz 4912
+VPMOVSXBDrm 4913
+VPMOVSXBDrr 4914
+VPMOVSXBQYrm 4915
+VPMOVSXBQYrr 4916
+VPMOVSXBQZ 4917
+VPMOVSXBQZrm 4918
+VPMOVSXBQZrmk 4919
+VPMOVSXBQZrmkz 4920
+VPMOVSXBQZrr 4921
+VPMOVSXBQZrrk 4922
+VPMOVSXBQZrrkz 4923
+VPMOVSXBQrm 4924
+VPMOVSXBQrr 4925
+VPMOVSXBWYrm 4926
+VPMOVSXBWYrr 4927
+VPMOVSXBWZ 4928
+VPMOVSXBWZrm 4929
+VPMOVSXBWZrmk 4930
+VPMOVSXBWZrmkz 4931
+VPMOVSXBWZrr 4932
+VPMOVSXBWZrrk 4933
+VPMOVSXBWZrrkz 4934
+VPMOVSXBWrm 4935
+VPMOVSXBWrr 4936
+VPMOVSXDQYrm 4937
+VPMOVSXDQYrr 4938
+VPMOVSXDQZ 4939
+VPMOVSXDQZrm 4940
+VPMOVSXDQZrmk 4941
+VPMOVSXDQZrmkz 4942
+VPMOVSXDQZrr 4943
+VPMOVSXDQZrrk 4944
+VPMOVSXDQZrrkz 4945
+VPMOVSXDQrm 4946
+VPMOVSXDQrr 4947
+VPMOVSXWDYrm 4948
+VPMOVSXWDYrr 4949
+VPMOVSXWDZ 4950
+VPMOVSXWDZrm 4951
+VPMOVSXWDZrmk 4952
+VPMOVSXWDZrmkz 4953
+VPMOVSXWDZrr 4954
+VPMOVSXWDZrrk 4955
+VPMOVSXWDZrrkz 4956
+VPMOVSXWDrm 4957
+VPMOVSXWDrr 4958
+VPMOVSXWQYrm 4959
+VPMOVSXWQYrr 4960
+VPMOVSXWQZ 4961
+VPMOVSXWQZrm 4962
+VPMOVSXWQZrmk 4963
+VPMOVSXWQZrmkz 4964
+VPMOVSXWQZrr 4965
+VPMOVSXWQZrrk 4966
+VPMOVSXWQZrrkz 4967
+VPMOVSXWQrm 4968
+VPMOVSXWQrr 4969
+VPMOVUSDBZ 4970
+VPMOVUSDBZmr 4971
+VPMOVUSDBZmrk 4972
+VPMOVUSDBZrr 4973
+VPMOVUSDBZrrk 4974
+VPMOVUSDBZrrkz 4975
+VPMOVUSDWZ 4976
+VPMOVUSDWZmr 4977
+VPMOVUSDWZmrk 4978
+VPMOVUSDWZrr 4979
+VPMOVUSDWZrrk 4980
+VPMOVUSDWZrrkz 4981
+VPMOVUSQBZ 4982
+VPMOVUSQBZmr 4983
+VPMOVUSQBZmrk 4984
+VPMOVUSQBZrr 4985
+VPMOVUSQBZrrk 4986
+VPMOVUSQBZrrkz 4987
+VPMOVUSQDZ 4988
+VPMOVUSQDZmr 4989
+VPMOVUSQDZmrk 4990
+VPMOVUSQDZrr 4991
+VPMOVUSQDZrrk 4992
+VPMOVUSQDZrrkz 4993
+VPMOVUSQWZ 4994
+VPMOVUSQWZmr 4995
+VPMOVUSQWZmrk 4996
+VPMOVUSQWZrr 4997
+VPMOVUSQWZrrk 4998
+VPMOVUSQWZrrkz 4999
+VPMOVUSWBZ 5000
+VPMOVUSWBZmr 5001
+VPMOVUSWBZmrk 5002
+VPMOVUSWBZrr 5003
+VPMOVUSWBZrrk 5004
+VPMOVUSWBZrrkz 5005
+VPMOVW 5006
+VPMOVWBZ 5007
+VPMOVWBZmr 5008
+VPMOVWBZmrk 5009
+VPMOVWBZrr 5010
+VPMOVWBZrrk 5011
+VPMOVWBZrrkz 5012
+VPMOVZXBDYrm 5013
+VPMOVZXBDYrr 5014
+VPMOVZXBDZ 5015
+VPMOVZXBDZrm 5016
+VPMOVZXBDZrmk 5017
+VPMOVZXBDZrmkz 5018
+VPMOVZXBDZrr 5019
+VPMOVZXBDZrrk 5020
+VPMOVZXBDZrrkz 5021
+VPMOVZXBDrm 5022
+VPMOVZXBDrr 5023
+VPMOVZXBQYrm 5024
+VPMOVZXBQYrr 5025
+VPMOVZXBQZ 5026
+VPMOVZXBQZrm 5027
+VPMOVZXBQZrmk 5028
+VPMOVZXBQZrmkz 5029
+VPMOVZXBQZrr 5030
+VPMOVZXBQZrrk 5031
+VPMOVZXBQZrrkz 5032
+VPMOVZXBQrm 5033
+VPMOVZXBQrr 5034
+VPMOVZXBWYrm 5035
+VPMOVZXBWYrr 5036
+VPMOVZXBWZ 5037
+VPMOVZXBWZrm 5038
+VPMOVZXBWZrmk 5039
+VPMOVZXBWZrmkz 5040
+VPMOVZXBWZrr 5041
+VPMOVZXBWZrrk 5042
+VPMOVZXBWZrrkz 5043
+VPMOVZXBWrm 5044
+VPMOVZXBWrr 5045
+VPMOVZXDQYrm 5046
+VPMOVZXDQYrr 5047
+VPMOVZXDQZ 5048
+VPMOVZXDQZrm 5049
+VPMOVZXDQZrmk 5050
+VPMOVZXDQZrmkz 5051
+VPMOVZXDQZrr 5052
+VPMOVZXDQZrrk 5053
+VPMOVZXDQZrrkz 5054
+VPMOVZXDQrm 5055
+VPMOVZXDQrr 5056
+VPMOVZXWDYrm 5057
+VPMOVZXWDYrr 5058
+VPMOVZXWDZ 5059
+VPMOVZXWDZrm 5060
+VPMOVZXWDZrmk 5061
+VPMOVZXWDZrmkz 5062
+VPMOVZXWDZrr 5063
+VPMOVZXWDZrrk 5064
+VPMOVZXWDZrrkz 5065
+VPMOVZXWDrm 5066
+VPMOVZXWDrr 5067
+VPMOVZXWQYrm 5068
+VPMOVZXWQYrr 5069
+VPMOVZXWQZ 5070
+VPMOVZXWQZrm 5071
+VPMOVZXWQZrmk 5072
+VPMOVZXWQZrmkz 5073
+VPMOVZXWQZrr 5074
+VPMOVZXWQZrrk 5075
+VPMOVZXWQZrrkz 5076
+VPMOVZXWQrm 5077
+VPMOVZXWQrr 5078
+VPMULDQYrm 5079
+VPMULDQYrr 5080
+VPMULDQZ 5081
+VPMULDQZrm 5082
+VPMULDQZrmb 5083
+VPMULDQZrmbk 5084
+VPMULDQZrmbkz 5085
+VPMULDQZrmk 5086
+VPMULDQZrmkz 5087
+VPMULDQZrr 5088
+VPMULDQZrrk 5089
+VPMULDQZrrkz 5090
+VPMULDQrm 5091
+VPMULDQrr 5092
+VPMULHRSWYrm 5093
+VPMULHRSWYrr 5094
+VPMULHRSWZ 5095
+VPMULHRSWZrm 5096
+VPMULHRSWZrmk 5097
+VPMULHRSWZrmkz 5098
+VPMULHRSWZrr 5099
+VPMULHRSWZrrk 5100
+VPMULHRSWZrrkz 5101
+VPMULHRSWrm 5102
+VPMULHRSWrr 5103
+VPMULHUWYrm 5104
+VPMULHUWYrr 5105
+VPMULHUWZ 5106
+VPMULHUWZrm 5107
+VPMULHUWZrmk 5108
+VPMULHUWZrmkz 5109
+VPMULHUWZrr 5110
+VPMULHUWZrrk 5111
+VPMULHUWZrrkz 5112
+VPMULHUWrm 5113
+VPMULHUWrr 5114
+VPMULHWYrm 5115
+VPMULHWYrr 5116
+VPMULHWZ 5117
+VPMULHWZrm 5118
+VPMULHWZrmk 5119
+VPMULHWZrmkz 5120
+VPMULHWZrr 5121
+VPMULHWZrrk 5122
+VPMULHWZrrkz 5123
+VPMULHWrm 5124
+VPMULHWrr 5125
+VPMULLDYrm 5126
+VPMULLDYrr 5127
+VPMULLDZ 5128
+VPMULLDZrm 5129
+VPMULLDZrmb 5130
+VPMULLDZrmbk 5131
+VPMULLDZrmbkz 5132
+VPMULLDZrmk 5133
+VPMULLDZrmkz 5134
+VPMULLDZrr 5135
+VPMULLDZrrk 5136
+VPMULLDZrrkz 5137
+VPMULLDrm 5138
+VPMULLDrr 5139
+VPMULLQZ 5140
+VPMULLQZrm 5141
+VPMULLQZrmb 5142
+VPMULLQZrmbk 5143
+VPMULLQZrmbkz 5144
+VPMULLQZrmk 5145
+VPMULLQZrmkz 5146
+VPMULLQZrr 5147
+VPMULLQZrrk 5148
+VPMULLQZrrkz 5149
+VPMULLWYrm 5150
+VPMULLWYrr 5151
+VPMULLWZ 5152
+VPMULLWZrm 5153
+VPMULLWZrmk 5154
+VPMULLWZrmkz 5155
+VPMULLWZrr 5156
+VPMULLWZrrk 5157
+VPMULLWZrrkz 5158
+VPMULLWrm 5159
+VPMULLWrr 5160
+VPMULTISHIFTQBZ 5161
+VPMULTISHIFTQBZrm 5162
+VPMULTISHIFTQBZrmb 5163
+VPMULTISHIFTQBZrmbk 5164
+VPMULTISHIFTQBZrmbkz 5165
+VPMULTISHIFTQBZrmk 5166
+VPMULTISHIFTQBZrmkz 5167
+VPMULTISHIFTQBZrr 5168
+VPMULTISHIFTQBZrrk 5169
+VPMULTISHIFTQBZrrkz 5170
+VPMULUDQYrm 5171
+VPMULUDQYrr 5172
+VPMULUDQZ 5173
+VPMULUDQZrm 5174
+VPMULUDQZrmb 5175
+VPMULUDQZrmbk 5176
+VPMULUDQZrmbkz 5177
+VPMULUDQZrmk 5178
+VPMULUDQZrmkz 5179
+VPMULUDQZrr 5180
+VPMULUDQZrrk 5181
+VPMULUDQZrrkz 5182
+VPMULUDQrm 5183
+VPMULUDQrr 5184
+VPOPCNTBZ 5185
+VPOPCNTBZrm 5186
+VPOPCNTBZrmk 5187
+VPOPCNTBZrmkz 5188
+VPOPCNTBZrr 5189
+VPOPCNTBZrrk 5190
+VPOPCNTBZrrkz 5191
+VPOPCNTDZ 5192
+VPOPCNTDZrm 5193
+VPOPCNTDZrmb 5194
+VPOPCNTDZrmbk 5195
+VPOPCNTDZrmbkz 5196
+VPOPCNTDZrmk 5197
+VPOPCNTDZrmkz 5198
+VPOPCNTDZrr 5199
+VPOPCNTDZrrk 5200
+VPOPCNTDZrrkz 5201
+VPOPCNTQZ 5202
+VPOPCNTQZrm 5203
+VPOPCNTQZrmb 5204
+VPOPCNTQZrmbk 5205
+VPOPCNTQZrmbkz 5206
+VPOPCNTQZrmk 5207
+VPOPCNTQZrmkz 5208
+VPOPCNTQZrr 5209
+VPOPCNTQZrrk 5210
+VPOPCNTQZrrkz 5211
+VPOPCNTWZ 5212
+VPOPCNTWZrm 5213
+VPOPCNTWZrmk 5214
+VPOPCNTWZrmkz 5215
+VPOPCNTWZrr 5216
+VPOPCNTWZrrk 5217
+VPOPCNTWZrrkz 5218
+VPORDZ 5219
+VPORDZrm 5220
+VPORDZrmb 5221
+VPORDZrmbk 5222
+VPORDZrmbkz 5223
+VPORDZrmk 5224
+VPORDZrmkz 5225
+VPORDZrr 5226
+VPORDZrrk 5227
+VPORDZrrkz 5228
+VPORQZ 5229
+VPORQZrm 5230
+VPORQZrmb 5231
+VPORQZrmbk 5232
+VPORQZrmbkz 5233
+VPORQZrmk 5234
+VPORQZrmkz 5235
+VPORQZrr 5236
+VPORQZrrk 5237
+VPORQZrrkz 5238
+VPORYrm 5239
+VPORYrr 5240
+VPORrm 5241
+VPORrr 5242
+VPPERMrmr 5243
+VPPERMrrm 5244
+VPPERMrrr 5245
+VPPERMrrr_REV 5246
+VPROLDZ 5247
+VPROLDZmbi 5248
+VPROLDZmbik 5249
+VPROLDZmbikz 5250
+VPROLDZmi 5251
+VPROLDZmik 5252
+VPROLDZmikz 5253
+VPROLDZri 5254
+VPROLDZrik 5255
+VPROLDZrikz 5256
+VPROLQZ 5257
+VPROLQZmbi 5258
+VPROLQZmbik 5259
+VPROLQZmbikz 5260
+VPROLQZmi 5261
+VPROLQZmik 5262
+VPROLQZmikz 5263
+VPROLQZri 5264
+VPROLQZrik 5265
+VPROLQZrikz 5266
+VPROLVDZ 5267
+VPROLVDZrm 5268
+VPROLVDZrmb 5269
+VPROLVDZrmbk 5270
+VPROLVDZrmbkz 5271
+VPROLVDZrmk 5272
+VPROLVDZrmkz 5273
+VPROLVDZrr 5274
+VPROLVDZrrk 5275
+VPROLVDZrrkz 5276
+VPROLVQZ 5277
+VPROLVQZrm 5278
+VPROLVQZrmb 5279
+VPROLVQZrmbk 5280
+VPROLVQZrmbkz 5281
+VPROLVQZrmk 5282
+VPROLVQZrmkz 5283
+VPROLVQZrr 5284
+VPROLVQZrrk 5285
+VPROLVQZrrkz 5286
+VPRORDZ 5287
+VPRORDZmbi 5288
+VPRORDZmbik 5289
+VPRORDZmbikz 5290
+VPRORDZmi 5291
+VPRORDZmik 5292
+VPRORDZmikz 5293
+VPRORDZri 5294
+VPRORDZrik 5295
+VPRORDZrikz 5296
+VPRORQZ 5297
+VPRORQZmbi 5298
+VPRORQZmbik 5299
+VPRORQZmbikz 5300
+VPRORQZmi 5301
+VPRORQZmik 5302
+VPRORQZmikz 5303
+VPRORQZri 5304
+VPRORQZrik 5305
+VPRORQZrikz 5306
+VPRORVDZ 5307
+VPRORVDZrm 5308
+VPRORVDZrmb 5309
+VPRORVDZrmbk 5310
+VPRORVDZrmbkz 5311
+VPRORVDZrmk 5312
+VPRORVDZrmkz 5313
+VPRORVDZrr 5314
+VPRORVDZrrk 5315
+VPRORVDZrrkz 5316
+VPRORVQZ 5317
+VPRORVQZrm 5318
+VPRORVQZrmb 5319
+VPRORVQZrmbk 5320
+VPRORVQZrmbkz 5321
+VPRORVQZrmk 5322
+VPRORVQZrmkz 5323
+VPRORVQZrr 5324
+VPRORVQZrrk 5325
+VPRORVQZrrkz 5326
+VPROTBmi 5327
+VPROTBmr 5328
+VPROTBri 5329
+VPROTBrm 5330
+VPROTBrr 5331
+VPROTBrr_REV 5332
+VPROTDmi 5333
+VPROTDmr 5334
+VPROTDri 5335
+VPROTDrm 5336
+VPROTDrr 5337
+VPROTDrr_REV 5338
+VPROTQmi 5339
+VPROTQmr 5340
+VPROTQri 5341
+VPROTQrm 5342
+VPROTQrr 5343
+VPROTQrr_REV 5344
+VPROTWmi 5345
+VPROTWmr 5346
+VPROTWri 5347
+VPROTWrm 5348
+VPROTWrr 5349
+VPROTWrr_REV 5350
+VPSADBWYrm 5351
+VPSADBWYrr 5352
+VPSADBWZ 5353
+VPSADBWZrm 5354
+VPSADBWZrr 5355
+VPSADBWrm 5356
+VPSADBWrr 5357
+VPSCATTERDDZ 5358
+VPSCATTERDDZmr 5359
+VPSCATTERDQZ 5360
+VPSCATTERDQZmr 5361
+VPSCATTERQDZ 5362
+VPSCATTERQDZmr 5363
+VPSCATTERQQZ 5364
+VPSCATTERQQZmr 5365
+VPSHABmr 5366
+VPSHABrm 5367
+VPSHABrr 5368
+VPSHABrr_REV 5369
+VPSHADmr 5370
+VPSHADrm 5371
+VPSHADrr 5372
+VPSHADrr_REV 5373
+VPSHAQmr 5374
+VPSHAQrm 5375
+VPSHAQrr 5376
+VPSHAQrr_REV 5377
+VPSHAWmr 5378
+VPSHAWrm 5379
+VPSHAWrr 5380
+VPSHAWrr_REV 5381
+VPSHLBmr 5382
+VPSHLBrm 5383
+VPSHLBrr 5384
+VPSHLBrr_REV 5385
+VPSHLDDZ 5386
+VPSHLDDZrmbi 5387
+VPSHLDDZrmbik 5388
+VPSHLDDZrmbikz 5389
+VPSHLDDZrmi 5390
+VPSHLDDZrmik 5391
+VPSHLDDZrmikz 5392
+VPSHLDDZrri 5393
+VPSHLDDZrrik 5394
+VPSHLDDZrrikz 5395
+VPSHLDQZ 5396
+VPSHLDQZrmbi 5397
+VPSHLDQZrmbik 5398
+VPSHLDQZrmbikz 5399
+VPSHLDQZrmi 5400
+VPSHLDQZrmik 5401
+VPSHLDQZrmikz 5402
+VPSHLDQZrri 5403
+VPSHLDQZrrik 5404
+VPSHLDQZrrikz 5405
+VPSHLDVDZ 5406
+VPSHLDVDZm 5407
+VPSHLDVDZmb 5408
+VPSHLDVDZmbk 5409
+VPSHLDVDZmbkz 5410
+VPSHLDVDZmk 5411
+VPSHLDVDZmkz 5412
+VPSHLDVDZr 5413
+VPSHLDVDZrk 5414
+VPSHLDVDZrkz 5415
+VPSHLDVQZ 5416
+VPSHLDVQZm 5417
+VPSHLDVQZmb 5418
+VPSHLDVQZmbk 5419
+VPSHLDVQZmbkz 5420
+VPSHLDVQZmk 5421
+VPSHLDVQZmkz 5422
+VPSHLDVQZr 5423
+VPSHLDVQZrk 5424
+VPSHLDVQZrkz 5425
+VPSHLDVWZ 5426
+VPSHLDVWZm 5427
+VPSHLDVWZmk 5428
+VPSHLDVWZmkz 5429
+VPSHLDVWZr 5430
+VPSHLDVWZrk 5431
+VPSHLDVWZrkz 5432
+VPSHLDWZ 5433
+VPSHLDWZrmi 5434
+VPSHLDWZrmik 5435
+VPSHLDWZrmikz 5436
+VPSHLDWZrri 5437
+VPSHLDWZrrik 5438
+VPSHLDWZrrikz 5439
+VPSHLDmr 5440
+VPSHLDrm 5441
+VPSHLDrr 5442
+VPSHLDrr_REV 5443
+VPSHLQmr 5444
+VPSHLQrm 5445
+VPSHLQrr 5446
+VPSHLQrr_REV 5447
+VPSHLWmr 5448
+VPSHLWrm 5449
+VPSHLWrr 5450
+VPSHLWrr_REV 5451
+VPSHRDDZ 5452
+VPSHRDDZrmbi 5453
+VPSHRDDZrmbik 5454
+VPSHRDDZrmbikz 5455
+VPSHRDDZrmi 5456
+VPSHRDDZrmik 5457
+VPSHRDDZrmikz 5458
+VPSHRDDZrri 5459
+VPSHRDDZrrik 5460
+VPSHRDDZrrikz 5461
+VPSHRDQZ 5462
+VPSHRDQZrmbi 5463
+VPSHRDQZrmbik 5464
+VPSHRDQZrmbikz 5465
+VPSHRDQZrmi 5466
+VPSHRDQZrmik 5467
+VPSHRDQZrmikz 5468
+VPSHRDQZrri 5469
+VPSHRDQZrrik 5470
+VPSHRDQZrrikz 5471
+VPSHRDVDZ 5472
+VPSHRDVDZm 5473
+VPSHRDVDZmb 5474
+VPSHRDVDZmbk 5475
+VPSHRDVDZmbkz 5476
+VPSHRDVDZmk 5477
+VPSHRDVDZmkz 5478
+VPSHRDVDZr 5479
+VPSHRDVDZrk 5480
+VPSHRDVDZrkz 5481
+VPSHRDVQZ 5482
+VPSHRDVQZm 5483
+VPSHRDVQZmb 5484
+VPSHRDVQZmbk 5485
+VPSHRDVQZmbkz 5486
+VPSHRDVQZmk 5487
+VPSHRDVQZmkz 5488
+VPSHRDVQZr 5489
+VPSHRDVQZrk 5490
+VPSHRDVQZrkz 5491
+VPSHRDVWZ 5492
+VPSHRDVWZm 5493
+VPSHRDVWZmk 5494
+VPSHRDVWZmkz 5495
+VPSHRDVWZr 5496
+VPSHRDVWZrk 5497
+VPSHRDVWZrkz 5498
+VPSHRDWZ 5499
+VPSHRDWZrmi 5500
+VPSHRDWZrmik 5501
+VPSHRDWZrmikz 5502
+VPSHRDWZrri 5503
+VPSHRDWZrrik 5504
+VPSHRDWZrrikz 5505
+VPSHUFBITQMBZ 5506
+VPSHUFBITQMBZrm 5507
+VPSHUFBITQMBZrmk 5508
+VPSHUFBITQMBZrr 5509
+VPSHUFBITQMBZrrk 5510
+VPSHUFBYrm 5511
+VPSHUFBYrr 5512
+VPSHUFBZ 5513
+VPSHUFBZrm 5514
+VPSHUFBZrmk 5515
+VPSHUFBZrmkz 5516
+VPSHUFBZrr 5517
+VPSHUFBZrrk 5518
+VPSHUFBZrrkz 5519
+VPSHUFBrm 5520
+VPSHUFBrr 5521
+VPSHUFDYmi 5522
+VPSHUFDYri 5523
+VPSHUFDZ 5524
+VPSHUFDZmbi 5525
+VPSHUFDZmbik 5526
+VPSHUFDZmbikz 5527
+VPSHUFDZmi 5528
+VPSHUFDZmik 5529
+VPSHUFDZmikz 5530
+VPSHUFDZri 5531
+VPSHUFDZrik 5532
+VPSHUFDZrikz 5533
+VPSHUFDmi 5534
+VPSHUFDri 5535
+VPSHUFHWYmi 5536
+VPSHUFHWYri 5537
+VPSHUFHWZ 5538
+VPSHUFHWZmi 5539
+VPSHUFHWZmik 5540
+VPSHUFHWZmikz 5541
+VPSHUFHWZri 5542
+VPSHUFHWZrik 5543
+VPSHUFHWZrikz 5544
+VPSHUFHWmi 5545
+VPSHUFHWri 5546
+VPSHUFLWYmi 5547
+VPSHUFLWYri 5548
+VPSHUFLWZ 5549
+VPSHUFLWZmi 5550
+VPSHUFLWZmik 5551
+VPSHUFLWZmikz 5552
+VPSHUFLWZri 5553
+VPSHUFLWZrik 5554
+VPSHUFLWZrikz 5555
+VPSHUFLWmi 5556
+VPSHUFLWri 5557
+VPSIGNBYrm 5558
+VPSIGNBYrr 5559
+VPSIGNBrm 5560
+VPSIGNBrr 5561
+VPSIGNDYrm 5562
+VPSIGNDYrr 5563
+VPSIGNDrm 5564
+VPSIGNDrr 5565
+VPSIGNWYrm 5566
+VPSIGNWYrr 5567
+VPSIGNWrm 5568
+VPSIGNWrr 5569
+VPSLLDQYri 5570
+VPSLLDQZ 5571
+VPSLLDQZmi 5572
+VPSLLDQZri 5573
+VPSLLDQri 5574
+VPSLLDYri 5575
+VPSLLDYrm 5576
+VPSLLDYrr 5577
+VPSLLDZ 5578
+VPSLLDZmbi 5579
+VPSLLDZmbik 5580
+VPSLLDZmbikz 5581
+VPSLLDZmi 5582
+VPSLLDZmik 5583
+VPSLLDZmikz 5584
+VPSLLDZri 5585
+VPSLLDZrik 5586
+VPSLLDZrikz 5587
+VPSLLDZrm 5588
+VPSLLDZrmk 5589
+VPSLLDZrmkz 5590
+VPSLLDZrr 5591
+VPSLLDZrrk 5592
+VPSLLDZrrkz 5593
+VPSLLDri 5594
+VPSLLDrm 5595
+VPSLLDrr 5596
+VPSLLQYri 5597
+VPSLLQYrm 5598
+VPSLLQYrr 5599
+VPSLLQZ 5600
+VPSLLQZmbi 5601
+VPSLLQZmbik 5602
+VPSLLQZmbikz 5603
+VPSLLQZmi 5604
+VPSLLQZmik 5605
+VPSLLQZmikz 5606
+VPSLLQZri 5607
+VPSLLQZrik 5608
+VPSLLQZrikz 5609
+VPSLLQZrm 5610
+VPSLLQZrmk 5611
+VPSLLQZrmkz 5612
+VPSLLQZrr 5613
+VPSLLQZrrk 5614
+VPSLLQZrrkz 5615
+VPSLLQri 5616
+VPSLLQrm 5617
+VPSLLQrr 5618
+VPSLLVDYrm 5619
+VPSLLVDYrr 5620
+VPSLLVDZ 5621
+VPSLLVDZrm 5622
+VPSLLVDZrmb 5623
+VPSLLVDZrmbk 5624
+VPSLLVDZrmbkz 5625
+VPSLLVDZrmk 5626
+VPSLLVDZrmkz 5627
+VPSLLVDZrr 5628
+VPSLLVDZrrk 5629
+VPSLLVDZrrkz 5630
+VPSLLVDrm 5631
+VPSLLVDrr 5632
+VPSLLVQYrm 5633
+VPSLLVQYrr 5634
+VPSLLVQZ 5635
+VPSLLVQZrm 5636
+VPSLLVQZrmb 5637
+VPSLLVQZrmbk 5638
+VPSLLVQZrmbkz 5639
+VPSLLVQZrmk 5640
+VPSLLVQZrmkz 5641
+VPSLLVQZrr 5642
+VPSLLVQZrrk 5643
+VPSLLVQZrrkz 5644
+VPSLLVQrm 5645
+VPSLLVQrr 5646
+VPSLLVWZ 5647
+VPSLLVWZrm 5648
+VPSLLVWZrmk 5649
+VPSLLVWZrmkz 5650
+VPSLLVWZrr 5651
+VPSLLVWZrrk 5652
+VPSLLVWZrrkz 5653
+VPSLLWYri 5654
+VPSLLWYrm 5655
+VPSLLWYrr 5656
+VPSLLWZ 5657
+VPSLLWZmi 5658
+VPSLLWZmik 5659
+VPSLLWZmikz 5660
+VPSLLWZri 5661
+VPSLLWZrik 5662
+VPSLLWZrikz 5663
+VPSLLWZrm 5664
+VPSLLWZrmk 5665
+VPSLLWZrmkz 5666
+VPSLLWZrr 5667
+VPSLLWZrrk 5668
+VPSLLWZrrkz 5669
+VPSLLWri 5670
+VPSLLWrm 5671
+VPSLLWrr 5672
+VPSRADYri 5673
+VPSRADYrm 5674
+VPSRADYrr 5675
+VPSRADZ 5676
+VPSRADZmbi 5677
+VPSRADZmbik 5678
+VPSRADZmbikz 5679
+VPSRADZmi 5680
+VPSRADZmik 5681
+VPSRADZmikz 5682
+VPSRADZri 5683
+VPSRADZrik 5684
+VPSRADZrikz 5685
+VPSRADZrm 5686
+VPSRADZrmk 5687
+VPSRADZrmkz 5688
+VPSRADZrr 5689
+VPSRADZrrk 5690
+VPSRADZrrkz 5691
+VPSRADri 5692
+VPSRADrm 5693
+VPSRADrr 5694
+VPSRAQZ 5695
+VPSRAQZmbi 5696
+VPSRAQZmbik 5697
+VPSRAQZmbikz 5698
+VPSRAQZmi 5699
+VPSRAQZmik 5700
+VPSRAQZmikz 5701
+VPSRAQZri 5702
+VPSRAQZrik 5703
+VPSRAQZrikz 5704
+VPSRAQZrm 5705
+VPSRAQZrmk 5706
+VPSRAQZrmkz 5707
+VPSRAQZrr 5708
+VPSRAQZrrk 5709
+VPSRAQZrrkz 5710
+VPSRAVDYrm 5711
+VPSRAVDYrr 5712
+VPSRAVDZ 5713
+VPSRAVDZrm 5714
+VPSRAVDZrmb 5715
+VPSRAVDZrmbk 5716
+VPSRAVDZrmbkz 5717
+VPSRAVDZrmk 5718
+VPSRAVDZrmkz 5719
+VPSRAVDZrr 5720
+VPSRAVDZrrk 5721
+VPSRAVDZrrkz 5722
+VPSRAVDrm 5723
+VPSRAVDrr 5724
+VPSRAVQZ 5725
+VPSRAVQZrm 5726
+VPSRAVQZrmb 5727
+VPSRAVQZrmbk 5728
+VPSRAVQZrmbkz 5729
+VPSRAVQZrmk 5730
+VPSRAVQZrmkz 5731
+VPSRAVQZrr 5732
+VPSRAVQZrrk 5733
+VPSRAVQZrrkz 5734
+VPSRAVWZ 5735
+VPSRAVWZrm 5736
+VPSRAVWZrmk 5737
+VPSRAVWZrmkz 5738
+VPSRAVWZrr 5739
+VPSRAVWZrrk 5740
+VPSRAVWZrrkz 5741
+VPSRAWYri 5742
+VPSRAWYrm 5743
+VPSRAWYrr 5744
+VPSRAWZ 5745
+VPSRAWZmi 5746
+VPSRAWZmik 5747
+VPSRAWZmikz 5748
+VPSRAWZri 5749
+VPSRAWZrik 5750
+VPSRAWZrikz 5751
+VPSRAWZrm 5752
+VPSRAWZrmk 5753
+VPSRAWZrmkz 5754
+VPSRAWZrr 5755
+VPSRAWZrrk 5756
+VPSRAWZrrkz 5757
+VPSRAWri 5758
+VPSRAWrm 5759
+VPSRAWrr 5760
+VPSRLDQYri 5761
+VPSRLDQZ 5762
+VPSRLDQZmi 5763
+VPSRLDQZri 5764
+VPSRLDQri 5765
+VPSRLDYri 5766
+VPSRLDYrm 5767
+VPSRLDYrr 5768
+VPSRLDZ 5769
+VPSRLDZmbi 5770
+VPSRLDZmbik 5771
+VPSRLDZmbikz 5772
+VPSRLDZmi 5773
+VPSRLDZmik 5774
+VPSRLDZmikz 5775
+VPSRLDZri 5776
+VPSRLDZrik 5777
+VPSRLDZrikz 5778
+VPSRLDZrm 5779
+VPSRLDZrmk 5780
+VPSRLDZrmkz 5781
+VPSRLDZrr 5782
+VPSRLDZrrk 5783
+VPSRLDZrrkz 5784
+VPSRLDri 5785
+VPSRLDrm 5786
+VPSRLDrr 5787
+VPSRLQYri 5788
+VPSRLQYrm 5789
+VPSRLQYrr 5790
+VPSRLQZ 5791
+VPSRLQZmbi 5792
+VPSRLQZmbik 5793
+VPSRLQZmbikz 5794
+VPSRLQZmi 5795
+VPSRLQZmik 5796
+VPSRLQZmikz 5797
+VPSRLQZri 5798
+VPSRLQZrik 5799
+VPSRLQZrikz 5800
+VPSRLQZrm 5801
+VPSRLQZrmk 5802
+VPSRLQZrmkz 5803
+VPSRLQZrr 5804
+VPSRLQZrrk 5805
+VPSRLQZrrkz 5806
+VPSRLQri 5807
+VPSRLQrm 5808
+VPSRLQrr 5809
+VPSRLVDYrm 5810
+VPSRLVDYrr 5811
+VPSRLVDZ 5812
+VPSRLVDZrm 5813
+VPSRLVDZrmb 5814
+VPSRLVDZrmbk 5815
+VPSRLVDZrmbkz 5816
+VPSRLVDZrmk 5817
+VPSRLVDZrmkz 5818
+VPSRLVDZrr 5819
+VPSRLVDZrrk 5820
+VPSRLVDZrrkz 5821
+VPSRLVDrm 5822
+VPSRLVDrr 5823
+VPSRLVQYrm 5824
+VPSRLVQYrr 5825
+VPSRLVQZ 5826
+VPSRLVQZrm 5827
+VPSRLVQZrmb 5828
+VPSRLVQZrmbk 5829
+VPSRLVQZrmbkz 5830
+VPSRLVQZrmk 5831
+VPSRLVQZrmkz 5832
+VPSRLVQZrr 5833
+VPSRLVQZrrk 5834
+VPSRLVQZrrkz 5835
+VPSRLVQrm 5836
+VPSRLVQrr 5837
+VPSRLVWZ 5838
+VPSRLVWZrm 5839
+VPSRLVWZrmk 5840
+VPSRLVWZrmkz 5841
+VPSRLVWZrr 5842
+VPSRLVWZrrk 5843
+VPSRLVWZrrkz 5844
+VPSRLWYri 5845
+VPSRLWYrm 5846
+VPSRLWYrr 5847
+VPSRLWZ 5848
+VPSRLWZmi 5849
+VPSRLWZmik 5850
+VPSRLWZmikz 5851
+VPSRLWZri 5852
+VPSRLWZrik 5853
+VPSRLWZrikz 5854
+VPSRLWZrm 5855
+VPSRLWZrmk 5856
+VPSRLWZrmkz 5857
+VPSRLWZrr 5858
+VPSRLWZrrk 5859
+VPSRLWZrrkz 5860
+VPSRLWri 5861
+VPSRLWrm 5862
+VPSRLWrr 5863
+VPSUBBYrm 5864
+VPSUBBYrr 5865
+VPSUBBZ 5866
+VPSUBBZrm 5867
+VPSUBBZrmk 5868
+VPSUBBZrmkz 5869
+VPSUBBZrr 5870
+VPSUBBZrrk 5871
+VPSUBBZrrkz 5872
+VPSUBBrm 5873
+VPSUBBrr 5874
+VPSUBDYrm 5875
+VPSUBDYrr 5876
+VPSUBDZ 5877
+VPSUBDZrm 5878
+VPSUBDZrmb 5879
+VPSUBDZrmbk 5880
+VPSUBDZrmbkz 5881
+VPSUBDZrmk 5882
+VPSUBDZrmkz 5883
+VPSUBDZrr 5884
+VPSUBDZrrk 5885
+VPSUBDZrrkz 5886
+VPSUBDrm 5887
+VPSUBDrr 5888
+VPSUBQYrm 5889
+VPSUBQYrr 5890
+VPSUBQZ 5891
+VPSUBQZrm 5892
+VPSUBQZrmb 5893
+VPSUBQZrmbk 5894
+VPSUBQZrmbkz 5895
+VPSUBQZrmk 5896
+VPSUBQZrmkz 5897
+VPSUBQZrr 5898
+VPSUBQZrrk 5899
+VPSUBQZrrkz 5900
+VPSUBQrm 5901
+VPSUBQrr 5902
+VPSUBSBYrm 5903
+VPSUBSBYrr 5904
+VPSUBSBZ 5905
+VPSUBSBZrm 5906
+VPSUBSBZrmk 5907
+VPSUBSBZrmkz 5908
+VPSUBSBZrr 5909
+VPSUBSBZrrk 5910
+VPSUBSBZrrkz 5911
+VPSUBSBrm 5912
+VPSUBSBrr 5913
+VPSUBSWYrm 5914
+VPSUBSWYrr 5915
+VPSUBSWZ 5916
+VPSUBSWZrm 5917
+VPSUBSWZrmk 5918
+VPSUBSWZrmkz 5919
+VPSUBSWZrr 5920
+VPSUBSWZrrk 5921
+VPSUBSWZrrkz 5922
+VPSUBSWrm 5923
+VPSUBSWrr 5924
+VPSUBUSBYrm 5925
+VPSUBUSBYrr 5926
+VPSUBUSBZ 5927
+VPSUBUSBZrm 5928
+VPSUBUSBZrmk 5929
+VPSUBUSBZrmkz 5930
+VPSUBUSBZrr 5931
+VPSUBUSBZrrk 5932
+VPSUBUSBZrrkz 5933
+VPSUBUSBrm 5934
+VPSUBUSBrr 5935
+VPSUBUSWYrm 5936
+VPSUBUSWYrr 5937
+VPSUBUSWZ 5938
+VPSUBUSWZrm 5939
+VPSUBUSWZrmk 5940
+VPSUBUSWZrmkz 5941
+VPSUBUSWZrr 5942
+VPSUBUSWZrrk 5943
+VPSUBUSWZrrkz 5944
+VPSUBUSWrm 5945
+VPSUBUSWrr 5946
+VPSUBWYrm 5947
+VPSUBWYrr 5948
+VPSUBWZ 5949
+VPSUBWZrm 5950
+VPSUBWZrmk 5951
+VPSUBWZrmkz 5952
+VPSUBWZrr 5953
+VPSUBWZrrk 5954
+VPSUBWZrrkz 5955
+VPSUBWrm 5956
+VPSUBWrr 5957
+VPTERNLOGDZ 5958
+VPTERNLOGDZrmbi 5959
+VPTERNLOGDZrmbik 5960
+VPTERNLOGDZrmbikz 5961
+VPTERNLOGDZrmi 5962
+VPTERNLOGDZrmik 5963
+VPTERNLOGDZrmikz 5964
+VPTERNLOGDZrri 5965
+VPTERNLOGDZrrik 5966
+VPTERNLOGDZrrikz 5967
+VPTERNLOGQZ 5968
+VPTERNLOGQZrmbi 5969
+VPTERNLOGQZrmbik 5970
+VPTERNLOGQZrmbikz 5971
+VPTERNLOGQZrmi 5972
+VPTERNLOGQZrmik 5973
+VPTERNLOGQZrmikz 5974
+VPTERNLOGQZrri 5975
+VPTERNLOGQZrrik 5976
+VPTERNLOGQZrrikz 5977
+VPTESTMBZ 5978
+VPTESTMBZrm 5979
+VPTESTMBZrmk 5980
+VPTESTMBZrr 5981
+VPTESTMBZrrk 5982
+VPTESTMDZ 5983
+VPTESTMDZrm 5984
+VPTESTMDZrmb 5985
+VPTESTMDZrmbk 5986
+VPTESTMDZrmk 5987
+VPTESTMDZrr 5988
+VPTESTMDZrrk 5989
+VPTESTMQZ 5990
+VPTESTMQZrm 5991
+VPTESTMQZrmb 5992
+VPTESTMQZrmbk 5993
+VPTESTMQZrmk 5994
+VPTESTMQZrr 5995
+VPTESTMQZrrk 5996
+VPTESTMWZ 5997
+VPTESTMWZrm 5998
+VPTESTMWZrmk 5999
+VPTESTMWZrr 6000
+VPTESTMWZrrk 6001
+VPTESTNMBZ 6002
+VPTESTNMBZrm 6003
+VPTESTNMBZrmk 6004
+VPTESTNMBZrr 6005
+VPTESTNMBZrrk 6006
+VPTESTNMDZ 6007
+VPTESTNMDZrm 6008
+VPTESTNMDZrmb 6009
+VPTESTNMDZrmbk 6010
+VPTESTNMDZrmk 6011
+VPTESTNMDZrr 6012
+VPTESTNMDZrrk 6013
+VPTESTNMQZ 6014
+VPTESTNMQZrm 6015
+VPTESTNMQZrmb 6016
+VPTESTNMQZrmbk 6017
+VPTESTNMQZrmk 6018
+VPTESTNMQZrr 6019
+VPTESTNMQZrrk 6020
+VPTESTNMWZ 6021
+VPTESTNMWZrm 6022
+VPTESTNMWZrmk 6023
+VPTESTNMWZrr 6024
+VPTESTNMWZrrk 6025
+VPTESTYrm 6026
+VPTESTYrr 6027
+VPTESTrm 6028
+VPTESTrr 6029
+VPUNPCKHBWYrm 6030
+VPUNPCKHBWYrr 6031
+VPUNPCKHBWZ 6032
+VPUNPCKHBWZrm 6033
+VPUNPCKHBWZrmk 6034
+VPUNPCKHBWZrmkz 6035
+VPUNPCKHBWZrr 6036
+VPUNPCKHBWZrrk 6037
+VPUNPCKHBWZrrkz 6038
+VPUNPCKHBWrm 6039
+VPUNPCKHBWrr 6040
+VPUNPCKHDQYrm 6041
+VPUNPCKHDQYrr 6042
+VPUNPCKHDQZ 6043
+VPUNPCKHDQZrm 6044
+VPUNPCKHDQZrmb 6045
+VPUNPCKHDQZrmbk 6046
+VPUNPCKHDQZrmbkz 6047
+VPUNPCKHDQZrmk 6048
+VPUNPCKHDQZrmkz 6049
+VPUNPCKHDQZrr 6050
+VPUNPCKHDQZrrk 6051
+VPUNPCKHDQZrrkz 6052
+VPUNPCKHDQrm 6053
+VPUNPCKHDQrr 6054
+VPUNPCKHQDQYrm 6055
+VPUNPCKHQDQYrr 6056
+VPUNPCKHQDQZ 6057
+VPUNPCKHQDQZrm 6058
+VPUNPCKHQDQZrmb 6059
+VPUNPCKHQDQZrmbk 6060
+VPUNPCKHQDQZrmbkz 6061
+VPUNPCKHQDQZrmk 6062
+VPUNPCKHQDQZrmkz 6063
+VPUNPCKHQDQZrr 6064
+VPUNPCKHQDQZrrk 6065
+VPUNPCKHQDQZrrkz 6066
+VPUNPCKHQDQrm 6067
+VPUNPCKHQDQrr 6068
+VPUNPCKHWDYrm 6069
+VPUNPCKHWDYrr 6070
+VPUNPCKHWDZ 6071
+VPUNPCKHWDZrm 6072
+VPUNPCKHWDZrmk 6073
+VPUNPCKHWDZrmkz 6074
+VPUNPCKHWDZrr 6075
+VPUNPCKHWDZrrk 6076
+VPUNPCKHWDZrrkz 6077
+VPUNPCKHWDrm 6078
+VPUNPCKHWDrr 6079
+VPUNPCKLBWYrm 6080
+VPUNPCKLBWYrr 6081
+VPUNPCKLBWZ 6082
+VPUNPCKLBWZrm 6083
+VPUNPCKLBWZrmk 6084
+VPUNPCKLBWZrmkz 6085
+VPUNPCKLBWZrr 6086
+VPUNPCKLBWZrrk 6087
+VPUNPCKLBWZrrkz 6088
+VPUNPCKLBWrm 6089
+VPUNPCKLBWrr 6090
+VPUNPCKLDQYrm 6091
+VPUNPCKLDQYrr 6092
+VPUNPCKLDQZ 6093
+VPUNPCKLDQZrm 6094
+VPUNPCKLDQZrmb 6095
+VPUNPCKLDQZrmbk 6096
+VPUNPCKLDQZrmbkz 6097
+VPUNPCKLDQZrmk 6098
+VPUNPCKLDQZrmkz 6099
+VPUNPCKLDQZrr 6100
+VPUNPCKLDQZrrk 6101
+VPUNPCKLDQZrrkz 6102
+VPUNPCKLDQrm 6103
+VPUNPCKLDQrr 6104
+VPUNPCKLQDQYrm 6105
+VPUNPCKLQDQYrr 6106
+VPUNPCKLQDQZ 6107
+VPUNPCKLQDQZrm 6108
+VPUNPCKLQDQZrmb 6109
+VPUNPCKLQDQZrmbk 6110
+VPUNPCKLQDQZrmbkz 6111
+VPUNPCKLQDQZrmk 6112
+VPUNPCKLQDQZrmkz 6113
+VPUNPCKLQDQZrr 6114
+VPUNPCKLQDQZrrk 6115
+VPUNPCKLQDQZrrkz 6116
+VPUNPCKLQDQrm 6117
+VPUNPCKLQDQrr 6118
+VPUNPCKLWDYrm 6119
+VPUNPCKLWDYrr 6120
+VPUNPCKLWDZ 6121
+VPUNPCKLWDZrm 6122
+VPUNPCKLWDZrmk 6123
+VPUNPCKLWDZrmkz 6124
+VPUNPCKLWDZrr 6125
+VPUNPCKLWDZrrk 6126
+VPUNPCKLWDZrrkz 6127
+VPUNPCKLWDrm 6128
+VPUNPCKLWDrr 6129
+VPXORDZ 6130
+VPXORDZrm 6131
+VPXORDZrmb 6132
+VPXORDZrmbk 6133
+VPXORDZrmbkz 6134
+VPXORDZrmk 6135
+VPXORDZrmkz 6136
+VPXORDZrr 6137
+VPXORDZrrk 6138
+VPXORDZrrkz 6139
+VPXORQZ 6140
+VPXORQZrm 6141
+VPXORQZrmb 6142
+VPXORQZrmbk 6143
+VPXORQZrmbkz 6144
+VPXORQZrmk 6145
+VPXORQZrmkz 6146
+VPXORQZrr 6147
+VPXORQZrrk 6148
+VPXORQZrrkz 6149
+VPXORYrm 6150
+VPXORYrr 6151
+VPXORrm 6152
+VPXORrr 6153
+VRANGEPDZ 6154
+VRANGEPDZrmbi 6155
+VRANGEPDZrmbik 6156
+VRANGEPDZrmbikz 6157
+VRANGEPDZrmi 6158
+VRANGEPDZrmik 6159
+VRANGEPDZrmikz 6160
+VRANGEPDZrri 6161
+VRANGEPDZrrib 6162
+VRANGEPDZrribk 6163
+VRANGEPDZrribkz 6164
+VRANGEPDZrrik 6165
+VRANGEPDZrrikz 6166
+VRANGEPSZ 6167
+VRANGEPSZrmbi 6168
+VRANGEPSZrmbik 6169
+VRANGEPSZrmbikz 6170
+VRANGEPSZrmi 6171
+VRANGEPSZrmik 6172
+VRANGEPSZrmikz 6173
+VRANGEPSZrri 6174
+VRANGEPSZrrib 6175
+VRANGEPSZrribk 6176
+VRANGEPSZrribkz 6177
+VRANGEPSZrrik 6178
+VRANGEPSZrrikz 6179
+VRANGESDZrmi 6180
+VRANGESDZrmik 6181
+VRANGESDZrmikz 6182
+VRANGESDZrri 6183
+VRANGESDZrrib 6184
+VRANGESDZrribk 6185
+VRANGESDZrribkz 6186
+VRANGESDZrrik 6187
+VRANGESDZrrikz 6188
+VRANGESSZrmi 6189
+VRANGESSZrmik 6190
+VRANGESSZrmikz 6191
+VRANGESSZrri 6192
+VRANGESSZrrib 6193
+VRANGESSZrribk 6194
+VRANGESSZrribkz 6195
+VRANGESSZrrik 6196
+VRANGESSZrrikz 6197
+VRCP 6198
+VRCPBF 6199
+VRCPPHZ 6200
+VRCPPHZm 6201
+VRCPPHZmb 6202
+VRCPPHZmbk 6203
+VRCPPHZmbkz 6204
+VRCPPHZmk 6205
+VRCPPHZmkz 6206
+VRCPPHZr 6207
+VRCPPHZrk 6208
+VRCPPHZrkz 6209
+VRCPPSYm 6210
+VRCPPSYr 6211
+VRCPPSm 6212
+VRCPPSr 6213
+VRCPSHZrm 6214
+VRCPSHZrmk 6215
+VRCPSHZrmkz 6216
+VRCPSHZrr 6217
+VRCPSHZrrk 6218
+VRCPSHZrrkz 6219
+VRCPSSm 6220
+VRCPSSm_Int 6221
+VRCPSSr 6222
+VRCPSSr_Int 6223
+VREDUCEBF 6224
+VREDUCEPDZ 6225
+VREDUCEPDZrmbi 6226
+VREDUCEPDZrmbik 6227
+VREDUCEPDZrmbikz 6228
+VREDUCEPDZrmi 6229
+VREDUCEPDZrmik 6230
+VREDUCEPDZrmikz 6231
+VREDUCEPDZrri 6232
+VREDUCEPDZrrib 6233
+VREDUCEPDZrribk 6234
+VREDUCEPDZrribkz 6235
+VREDUCEPDZrrik 6236
+VREDUCEPDZrrikz 6237
+VREDUCEPHZ 6238
+VREDUCEPHZrmbi 6239
+VREDUCEPHZrmbik 6240
+VREDUCEPHZrmbikz 6241
+VREDUCEPHZrmi 6242
+VREDUCEPHZrmik 6243
+VREDUCEPHZrmikz 6244
+VREDUCEPHZrri 6245
+VREDUCEPHZrrib 6246
+VREDUCEPHZrribk 6247
+VREDUCEPHZrribkz 6248
+VREDUCEPHZrrik 6249
+VREDUCEPHZrrikz 6250
+VREDUCEPSZ 6251
+VREDUCEPSZrmbi 6252
+VREDUCEPSZrmbik 6253
+VREDUCEPSZrmbikz 6254
+VREDUCEPSZrmi 6255
+VREDUCEPSZrmik 6256
+VREDUCEPSZrmikz 6257
+VREDUCEPSZrri 6258
+VREDUCEPSZrrib 6259
+VREDUCEPSZrribk 6260
+VREDUCEPSZrribkz 6261
+VREDUCEPSZrrik 6262
+VREDUCEPSZrrikz 6263
+VREDUCESDZrmi 6264
+VREDUCESDZrmik 6265
+VREDUCESDZrmikz 6266
+VREDUCESDZrri 6267
+VREDUCESDZrrib 6268
+VREDUCESDZrribk 6269
+VREDUCESDZrribkz 6270
+VREDUCESDZrrik 6271
+VREDUCESDZrrikz 6272
+VREDUCESHZrmi 6273
+VREDUCESHZrmik 6274
+VREDUCESHZrmikz 6275
+VREDUCESHZrri 6276
+VREDUCESHZrrib 6277
+VREDUCESHZrribk 6278
+VREDUCESHZrribkz 6279
+VREDUCESHZrrik 6280
+VREDUCESHZrrikz 6281
+VREDUCESSZrmi 6282
+VREDUCESSZrmik 6283
+VREDUCESSZrmikz 6284
+VREDUCESSZrri 6285
+VREDUCESSZrrib 6286
+VREDUCESSZrribk 6287
+VREDUCESSZrribkz 6288
+VREDUCESSZrrik 6289
+VREDUCESSZrrikz 6290
+VRNDSCALEBF 6291
+VRNDSCALEPDZ 6292
+VRNDSCALEPDZrmbi 6293
+VRNDSCALEPDZrmbik 6294
+VRNDSCALEPDZrmbikz 6295
+VRNDSCALEPDZrmi 6296
+VRNDSCALEPDZrmik 6297
+VRNDSCALEPDZrmikz 6298
+VRNDSCALEPDZrri 6299
+VRNDSCALEPDZrrib 6300
+VRNDSCALEPDZrribk 6301
+VRNDSCALEPDZrribkz 6302
+VRNDSCALEPDZrrik 6303
+VRNDSCALEPDZrrikz 6304
+VRNDSCALEPHZ 6305
+VRNDSCALEPHZrmbi 6306
+VRNDSCALEPHZrmbik 6307
+VRNDSCALEPHZrmbikz 6308
+VRNDSCALEPHZrmi 6309
+VRNDSCALEPHZrmik 6310
+VRNDSCALEPHZrmikz 6311
+VRNDSCALEPHZrri 6312
+VRNDSCALEPHZrrib 6313
+VRNDSCALEPHZrribk 6314
+VRNDSCALEPHZrribkz 6315
+VRNDSCALEPHZrrik 6316
+VRNDSCALEPHZrrikz 6317
+VRNDSCALEPSZ 6318
+VRNDSCALEPSZrmbi 6319
+VRNDSCALEPSZrmbik 6320
+VRNDSCALEPSZrmbikz 6321
+VRNDSCALEPSZrmi 6322
+VRNDSCALEPSZrmik 6323
+VRNDSCALEPSZrmikz 6324
+VRNDSCALEPSZrri 6325
+VRNDSCALEPSZrrib 6326
+VRNDSCALEPSZrribk 6327
+VRNDSCALEPSZrribkz 6328
+VRNDSCALEPSZrrik 6329
+VRNDSCALEPSZrrikz 6330
+VRNDSCALESDZrmi 6331
+VRNDSCALESDZrmi_Int 6332
+VRNDSCALESDZrmik_Int 6333
+VRNDSCALESDZrmikz_Int 6334
+VRNDSCALESDZrri 6335
+VRNDSCALESDZrri_Int 6336
+VRNDSCALESDZrrib_Int 6337
+VRNDSCALESDZrribk_Int 6338
+VRNDSCALESDZrribkz_Int 6339
+VRNDSCALESDZrrik_Int 6340
+VRNDSCALESDZrrikz_Int 6341
+VRNDSCALESHZrmi 6342
+VRNDSCALESHZrmi_Int 6343
+VRNDSCALESHZrmik_Int 6344
+VRNDSCALESHZrmikz_Int 6345
+VRNDSCALESHZrri 6346
+VRNDSCALESHZrri_Int 6347
+VRNDSCALESHZrrib_Int 6348
+VRNDSCALESHZrribk_Int 6349
+VRNDSCALESHZrribkz_Int 6350
+VRNDSCALESHZrrik_Int 6351
+VRNDSCALESHZrrikz_Int 6352
+VRNDSCALESSZrmi 6353
+VRNDSCALESSZrmi_Int 6354
+VRNDSCALESSZrmik_Int 6355
+VRNDSCALESSZrmikz_Int 6356
+VRNDSCALESSZrri 6357
+VRNDSCALESSZrri_Int 6358
+VRNDSCALESSZrrib_Int 6359
+VRNDSCALESSZrribk_Int 6360
+VRNDSCALESSZrribkz_Int 6361
+VRNDSCALESSZrrik_Int 6362
+VRNDSCALESSZrrikz_Int 6363
+VROUNDPDYmi 6364
+VROUNDPDYri 6365
+VROUNDPDmi 6366
+VROUNDPDri 6367
+VROUNDPSYmi 6368
+VROUNDPSYri 6369
+VROUNDPSmi 6370
+VROUNDPSri 6371
+VROUNDSDmi 6372
+VROUNDSDmi_Int 6373
+VROUNDSDri 6374
+VROUNDSDri_Int 6375
+VROUNDSSmi 6376
+VROUNDSSmi_Int 6377
+VROUNDSSri 6378
+VROUNDSSri_Int 6379
+VRSQRT 6380
+VRSQRTBF 6381
+VRSQRTPHZ 6382
+VRSQRTPHZm 6383
+VRSQRTPHZmb 6384
+VRSQRTPHZmbk 6385
+VRSQRTPHZmbkz 6386
+VRSQRTPHZmk 6387
+VRSQRTPHZmkz 6388
+VRSQRTPHZr 6389
+VRSQRTPHZrk 6390
+VRSQRTPHZrkz 6391
+VRSQRTPSYm 6392
+VRSQRTPSYr 6393
+VRSQRTPSm 6394
+VRSQRTPSr 6395
+VRSQRTSHZrm 6396
+VRSQRTSHZrmk 6397
+VRSQRTSHZrmkz 6398
+VRSQRTSHZrr 6399
+VRSQRTSHZrrk 6400
+VRSQRTSHZrrkz 6401
+VRSQRTSSm 6402
+VRSQRTSSm_Int 6403
+VRSQRTSSr 6404
+VRSQRTSSr_Int 6405
+VSCALEFBF 6406
+VSCALEFPDZ 6407
+VSCALEFPDZrm 6408
+VSCALEFPDZrmb 6409
+VSCALEFPDZrmbk 6410
+VSCALEFPDZrmbkz 6411
+VSCALEFPDZrmk 6412
+VSCALEFPDZrmkz 6413
+VSCALEFPDZrr 6414
+VSCALEFPDZrrb 6415
+VSCALEFPDZrrbk 6416
+VSCALEFPDZrrbkz 6417
+VSCALEFPDZrrk 6418
+VSCALEFPDZrrkz 6419
+VSCALEFPHZ 6420
+VSCALEFPHZrm 6421
+VSCALEFPHZrmb 6422
+VSCALEFPHZrmbk 6423
+VSCALEFPHZrmbkz 6424
+VSCALEFPHZrmk 6425
+VSCALEFPHZrmkz 6426
+VSCALEFPHZrr 6427
+VSCALEFPHZrrb 6428
+VSCALEFPHZrrbk 6429
+VSCALEFPHZrrbkz 6430
+VSCALEFPHZrrk 6431
+VSCALEFPHZrrkz 6432
+VSCALEFPSZ 6433
+VSCALEFPSZrm 6434
+VSCALEFPSZrmb 6435
+VSCALEFPSZrmbk 6436
+VSCALEFPSZrmbkz 6437
+VSCALEFPSZrmk 6438
+VSCALEFPSZrmkz 6439
+VSCALEFPSZrr 6440
+VSCALEFPSZrrb 6441
+VSCALEFPSZrrbk 6442
+VSCALEFPSZrrbkz 6443
+VSCALEFPSZrrk 6444
+VSCALEFPSZrrkz 6445
+VSCALEFSDZrm 6446
+VSCALEFSDZrmk 6447
+VSCALEFSDZrmkz 6448
+VSCALEFSDZrr 6449
+VSCALEFSDZrrb_Int 6450
+VSCALEFSDZrrbk_Int 6451
+VSCALEFSDZrrbkz_Int 6452
+VSCALEFSDZrrk 6453
+VSCALEFSDZrrkz 6454
+VSCALEFSHZrm 6455
+VSCALEFSHZrmk 6456
+VSCALEFSHZrmkz 6457
+VSCALEFSHZrr 6458
+VSCALEFSHZrrb_Int 6459
+VSCALEFSHZrrbk_Int 6460
+VSCALEFSHZrrbkz_Int 6461
+VSCALEFSHZrrk 6462
+VSCALEFSHZrrkz 6463
+VSCALEFSSZrm 6464
+VSCALEFSSZrmk 6465
+VSCALEFSSZrmkz 6466
+VSCALEFSSZrr 6467
+VSCALEFSSZrrb_Int 6468
+VSCALEFSSZrrbk_Int 6469
+VSCALEFSSZrrbkz_Int 6470
+VSCALEFSSZrrk 6471
+VSCALEFSSZrrkz 6472
+VSCATTERDPDZ 6473
+VSCATTERDPDZmr 6474
+VSCATTERDPSZ 6475
+VSCATTERDPSZmr 6476
+VSCATTERPF 6477
+VSCATTERQPDZ 6478
+VSCATTERQPDZmr 6479
+VSCATTERQPSZ 6480
+VSCATTERQPSZmr 6481
+VSHA 6482
+VSHUFF 6483
+VSHUFI 6484
+VSHUFPDYrmi 6485
+VSHUFPDYrri 6486
+VSHUFPDZ 6487
+VSHUFPDZrmbi 6488
+VSHUFPDZrmbik 6489
+VSHUFPDZrmbikz 6490
+VSHUFPDZrmi 6491
+VSHUFPDZrmik 6492
+VSHUFPDZrmikz 6493
+VSHUFPDZrri 6494
+VSHUFPDZrrik 6495
+VSHUFPDZrrikz 6496
+VSHUFPDrmi 6497
+VSHUFPDrri 6498
+VSHUFPSYrmi 6499
+VSHUFPSYrri 6500
+VSHUFPSZ 6501
+VSHUFPSZrmbi 6502
+VSHUFPSZrmbik 6503
+VSHUFPSZrmbikz 6504
+VSHUFPSZrmi 6505
+VSHUFPSZrmik 6506
+VSHUFPSZrmikz 6507
+VSHUFPSZrri 6508
+VSHUFPSZrrik 6509
+VSHUFPSZrrikz 6510
+VSHUFPSrmi 6511
+VSHUFPSrri 6512
+VSM 6513
+VSQRTBF 6514
+VSQRTPDYm 6515
+VSQRTPDYr 6516
+VSQRTPDZ 6517
+VSQRTPDZm 6518
+VSQRTPDZmb 6519
+VSQRTPDZmbk 6520
+VSQRTPDZmbkz 6521
+VSQRTPDZmk 6522
+VSQRTPDZmkz 6523
+VSQRTPDZr 6524
+VSQRTPDZrb 6525
+VSQRTPDZrbk 6526
+VSQRTPDZrbkz 6527
+VSQRTPDZrk 6528
+VSQRTPDZrkz 6529
+VSQRTPDm 6530
+VSQRTPDr 6531
+VSQRTPHZ 6532
+VSQRTPHZm 6533
+VSQRTPHZmb 6534
+VSQRTPHZmbk 6535
+VSQRTPHZmbkz 6536
+VSQRTPHZmk 6537
+VSQRTPHZmkz 6538
+VSQRTPHZr 6539
+VSQRTPHZrb 6540
+VSQRTPHZrbk 6541
+VSQRTPHZrbkz 6542
+VSQRTPHZrk 6543
+VSQRTPHZrkz 6544
+VSQRTPSYm 6545
+VSQRTPSYr 6546
+VSQRTPSZ 6547
+VSQRTPSZm 6548
+VSQRTPSZmb 6549
+VSQRTPSZmbk 6550
+VSQRTPSZmbkz 6551
+VSQRTPSZmk 6552
+VSQRTPSZmkz 6553
+VSQRTPSZr 6554
+VSQRTPSZrb 6555
+VSQRTPSZrbk 6556
+VSQRTPSZrbkz 6557
+VSQRTPSZrk 6558
+VSQRTPSZrkz 6559
+VSQRTPSm 6560
+VSQRTPSr 6561
+VSQRTSDZm 6562
+VSQRTSDZm_Int 6563
+VSQRTSDZmk_Int 6564
+VSQRTSDZmkz_Int 6565
+VSQRTSDZr 6566
+VSQRTSDZr_Int 6567
+VSQRTSDZrb_Int 6568
+VSQRTSDZrbk_Int 6569
+VSQRTSDZrbkz_Int 6570
+VSQRTSDZrk_Int 6571
+VSQRTSDZrkz_Int 6572
+VSQRTSDm 6573
+VSQRTSDm_Int 6574
+VSQRTSDr 6575
+VSQRTSDr_Int 6576
+VSQRTSHZm 6577
+VSQRTSHZm_Int 6578
+VSQRTSHZmk_Int 6579
+VSQRTSHZmkz_Int 6580
+VSQRTSHZr 6581
+VSQRTSHZr_Int 6582
+VSQRTSHZrb_Int 6583
+VSQRTSHZrbk_Int 6584
+VSQRTSHZrbkz_Int 6585
+VSQRTSHZrk_Int 6586
+VSQRTSHZrkz_Int 6587
+VSQRTSSZm 6588
+VSQRTSSZm_Int 6589
+VSQRTSSZmk_Int 6590
+VSQRTSSZmkz_Int 6591
+VSQRTSSZr 6592
+VSQRTSSZr_Int 6593
+VSQRTSSZrb_Int 6594
+VSQRTSSZrbk_Int 6595
+VSQRTSSZrbkz_Int 6596
+VSQRTSSZrk_Int 6597
+VSQRTSSZrkz_Int 6598
+VSQRTSSm 6599
+VSQRTSSm_Int 6600
+VSQRTSSr 6601
+VSQRTSSr_Int 6602
+VSTMXCSR 6603
+VSUBBF 6604
+VSUBPDYrm 6605
+VSUBPDYrr 6606
+VSUBPDZ 6607
+VSUBPDZrm 6608
+VSUBPDZrmb 6609
+VSUBPDZrmbk 6610
+VSUBPDZrmbkz 6611
+VSUBPDZrmk 6612
+VSUBPDZrmkz 6613
+VSUBPDZrr 6614
+VSUBPDZrrb 6615
+VSUBPDZrrbk 6616
+VSUBPDZrrbkz 6617
+VSUBPDZrrk 6618
+VSUBPDZrrkz 6619
+VSUBPDrm 6620
+VSUBPDrr 6621
+VSUBPHZ 6622
+VSUBPHZrm 6623
+VSUBPHZrmb 6624
+VSUBPHZrmbk 6625
+VSUBPHZrmbkz 6626
+VSUBPHZrmk 6627
+VSUBPHZrmkz 6628
+VSUBPHZrr 6629
+VSUBPHZrrb 6630
+VSUBPHZrrbk 6631
+VSUBPHZrrbkz 6632
+VSUBPHZrrk 6633
+VSUBPHZrrkz 6634
+VSUBPSYrm 6635
+VSUBPSYrr 6636
+VSUBPSZ 6637
+VSUBPSZrm 6638
+VSUBPSZrmb 6639
+VSUBPSZrmbk 6640
+VSUBPSZrmbkz 6641
+VSUBPSZrmk 6642
+VSUBPSZrmkz 6643
+VSUBPSZrr 6644
+VSUBPSZrrb 6645
+VSUBPSZrrbk 6646
+VSUBPSZrrbkz 6647
+VSUBPSZrrk 6648
+VSUBPSZrrkz 6649
+VSUBPSrm 6650
+VSUBPSrr 6651
+VSUBSDZrm 6652
+VSUBSDZrm_Int 6653
+VSUBSDZrmk_Int 6654
+VSUBSDZrmkz_Int 6655
+VSUBSDZrr 6656
+VSUBSDZrr_Int 6657
+VSUBSDZrrb_Int 6658
+VSUBSDZrrbk_Int 6659
+VSUBSDZrrbkz_Int 6660
+VSUBSDZrrk_Int 6661
+VSUBSDZrrkz_Int 6662
+VSUBSDrm 6663
+VSUBSDrm_Int 6664
+VSUBSDrr 6665
+VSUBSDrr_Int 6666
+VSUBSHZrm 6667
+VSUBSHZrm_Int 6668
+VSUBSHZrmk_Int 6669
+VSUBSHZrmkz_Int 6670
+VSUBSHZrr 6671
+VSUBSHZrr_Int 6672
+VSUBSHZrrb_Int 6673
+VSUBSHZrrbk_Int 6674
+VSUBSHZrrbkz_Int 6675
+VSUBSHZrrk_Int 6676
+VSUBSHZrrkz_Int 6677
+VSUBSSZrm 6678
+VSUBSSZrm_Int 6679
+VSUBSSZrmk_Int 6680
+VSUBSSZrmkz_Int 6681
+VSUBSSZrr 6682
+VSUBSSZrr_Int 6683
+VSUBSSZrrb_Int 6684
+VSUBSSZrrbk_Int 6685
+VSUBSSZrrbkz_Int 6686
+VSUBSSZrrk_Int 6687
+VSUBSSZrrkz_Int 6688
+VSUBSSrm 6689
+VSUBSSrm_Int 6690
+VSUBSSrr 6691
+VSUBSSrr_Int 6692
+VTESTPDYrm 6693
+VTESTPDYrr 6694
+VTESTPDrm 6695
+VTESTPDrr 6696
+VTESTPSYrm 6697
+VTESTPSYrr 6698
+VTESTPSrm 6699
+VTESTPSrr 6700
+VUCOMISDZrm 6701
+VUCOMISDZrm_Int 6702
+VUCOMISDZrr 6703
+VUCOMISDZrr_Int 6704
+VUCOMISDZrrb 6705
+VUCOMISDrm 6706
+VUCOMISDrm_Int 6707
+VUCOMISDrr 6708
+VUCOMISDrr_Int 6709
+VUCOMISHZrm 6710
+VUCOMISHZrm_Int 6711
+VUCOMISHZrr 6712
+VUCOMISHZrr_Int 6713
+VUCOMISHZrrb 6714
+VUCOMISSZrm 6715
+VUCOMISSZrm_Int 6716
+VUCOMISSZrr 6717
+VUCOMISSZrr_Int 6718
+VUCOMISSZrrb 6719
+VUCOMISSrm 6720
+VUCOMISSrm_Int 6721
+VUCOMISSrr 6722
+VUCOMISSrr_Int 6723
+VUCOMXSDZrm 6724
+VUCOMXSDZrm_Int 6725
+VUCOMXSDZrr 6726
+VUCOMXSDZrr_Int 6727
+VUCOMXSDZrrb_Int 6728
+VUCOMXSHZrm 6729
+VUCOMXSHZrm_Int 6730
+VUCOMXSHZrr 6731
+VUCOMXSHZrr_Int 6732
+VUCOMXSHZrrb_Int 6733
+VUCOMXSSZrm 6734
+VUCOMXSSZrm_Int 6735
+VUCOMXSSZrr 6736
+VUCOMXSSZrr_Int 6737
+VUCOMXSSZrrb_Int 6738
+VUNPCKHPDYrm 6739
+VUNPCKHPDYrr 6740
+VUNPCKHPDZ 6741
+VUNPCKHPDZrm 6742
+VUNPCKHPDZrmb 6743
+VUNPCKHPDZrmbk 6744
+VUNPCKHPDZrmbkz 6745
+VUNPCKHPDZrmk 6746
+VUNPCKHPDZrmkz 6747
+VUNPCKHPDZrr 6748
+VUNPCKHPDZrrk 6749
+VUNPCKHPDZrrkz 6750
+VUNPCKHPDrm 6751
+VUNPCKHPDrr 6752
+VUNPCKHPSYrm 6753
+VUNPCKHPSYrr 6754
+VUNPCKHPSZ 6755
+VUNPCKHPSZrm 6756
+VUNPCKHPSZrmb 6757
+VUNPCKHPSZrmbk 6758
+VUNPCKHPSZrmbkz 6759
+VUNPCKHPSZrmk 6760
+VUNPCKHPSZrmkz 6761
+VUNPCKHPSZrr 6762
+VUNPCKHPSZrrk 6763
+VUNPCKHPSZrrkz 6764
+VUNPCKHPSrm 6765
+VUNPCKHPSrr 6766
+VUNPCKLPDYrm 6767
+VUNPCKLPDYrr 6768
+VUNPCKLPDZ 6769
+VUNPCKLPDZrm 6770
+VUNPCKLPDZrmb 6771
+VUNPCKLPDZrmbk 6772
+VUNPCKLPDZrmbkz 6773
+VUNPCKLPDZrmk 6774
+VUNPCKLPDZrmkz 6775
+VUNPCKLPDZrr 6776
+VUNPCKLPDZrrk 6777
+VUNPCKLPDZrrkz 6778
+VUNPCKLPDrm 6779
+VUNPCKLPDrr 6780
+VUNPCKLPSYrm 6781
+VUNPCKLPSYrr 6782
+VUNPCKLPSZ 6783
+VUNPCKLPSZrm 6784
+VUNPCKLPSZrmb 6785
+VUNPCKLPSZrmbk 6786
+VUNPCKLPSZrmbkz 6787
+VUNPCKLPSZrmk 6788
+VUNPCKLPSZrmkz 6789
+VUNPCKLPSZrr 6790
+VUNPCKLPSZrrk 6791
+VUNPCKLPSZrrkz 6792
+VUNPCKLPSrm 6793
+VUNPCKLPSrr 6794
+VXORPDYrm 6795
+VXORPDYrr 6796
+VXORPDZ 6797
+VXORPDZrm 6798
+VXORPDZrmb 6799
+VXORPDZrmbk 6800
+VXORPDZrmbkz 6801
+VXORPDZrmk 6802
+VXORPDZrmkz 6803
+VXORPDZrr 6804
+VXORPDZrrk 6805
+VXORPDZrrkz 6806
+VXORPDrm 6807
+VXORPDrr 6808
+VXORPSYrm 6809
+VXORPSYrr 6810
+VXORPSZ 6811
+VXORPSZrm 6812
+VXORPSZrmb 6813
+VXORPSZrmbk 6814
+VXORPSZrmbkz 6815
+VXORPSZrmk 6816
+VXORPSZrmkz 6817
+VXORPSZrr 6818
+VXORPSZrrk 6819
+VXORPSZrrkz 6820
+VXORPSrm 6821
+VXORPSrr 6822
+VZEROALL 6823
+VZEROUPPER 6824
+V_SET 6825
+V_SETALLONES 6826
+WAIT 6827
+WBINVD 6828
+WBNOINVD 6829
+WRFLAGS 6830
+WRFSBASE 6831
+WRGSBASE 6832
+WRMSR 6833
+WRMSRLIST 6834
+WRMSRNS 6835
+WRMSRNSir 6836
+WRMSRNSir_EVEX 6837
+WRPKRUr 6838
+WRSSD 6839
+WRSSD_EVEX 6840
+WRSSQ 6841
+WRSSQ_EVEX 6842
+WRUSSD 6843
+WRUSSD_EVEX 6844
+WRUSSQ 6845
+WRUSSQ_EVEX 6846
+XABORT 6847
+XABORT_DEF 6848
+XACQUIRE_PREFIX 6849
+XADD 6850
+XAM_F 6851
+XAM_Fp 6852
+XBEGIN 6853
+XCHG 6854
+XCH_F 6855
+XCRYPTCBC 6856
+XCRYPTCFB 6857
+XCRYPTCTR 6858
+XCRYPTECB 6859
+XCRYPTOFB 6860
+XEND 6861
+XGETBV 6862
+XLAT 6863
+XOR 6864
+XORPDrm 6865
+XORPDrr 6866
+XORPSrm 6867
+XORPSrr 6868
+XRELEASE_PREFIX 6869
+XRESLDTRK 6870
+XRSTOR 6871
+XRSTORS 6872
+XSAVE 6873
+XSAVEC 6874
+XSAVEOPT 6875
+XSAVES 6876
+XSETBV 6877
+XSHA 6878
+XSTORE 6879
+XSUSLDTRK 6880
+XTEST 6881
+Immediate 6882
+CImmediate 6883
+FPImmediate 6884
+MBB 6885
+FrameIndex 6886
+ConstantPoolIndex 6887
+TargetIndex 6888
+JumpTableIndex 6889
+ExternalSymbol 6890
+GlobalAddress 6891
+BlockAddress 6892
+RegisterMask 6893
+RegisterLiveOut 6894
+Metadata 6895
+MCSymbol 6896
+CFIIndex 6897
+IntrinsicID 6898
+Predicate 6899
+ShuffleMask 6900
+PhyReg_GR8 6901
+PhyReg_GRH8 6902
+PhyReg_GR8_NOREX2 6903
+PhyReg_GR8_NOREX 6904
+PhyReg_GR8_ABCD_H 6905
+PhyReg_GR8_ABCD_L 6906
+PhyReg_GRH16 6907
+PhyReg_GR16 6908
+PhyReg_GR16_NOREX2 6909
+PhyReg_GR16_NOREX 6910
+PhyReg_VK1 6911
+PhyReg_VK16 6912
+PhyReg_VK2 6913
+PhyReg_VK4 6914
+PhyReg_VK8 6915
+PhyReg_VK16WM 6916
+PhyReg_VK1WM 6917
+PhyReg_VK2WM 6918
+PhyReg_VK4WM 6919
+PhyReg_VK8WM 6920
+PhyReg_SEGMENT_REG 6921
+PhyReg_GR16_ABCD 6922
+PhyReg_FPCCR 6923
+PhyReg_FR16X 6924
+PhyReg_FR16 6925
+PhyReg_VK16PAIR 6926
+PhyReg_VK1PAIR 6927
+PhyReg_VK2PAIR 6928
+PhyReg_VK4PAIR 6929
+PhyReg_VK8PAIR 6930
+PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6931
+PhyReg_LOW32_ADDR_ACCESS_RBP 6932
+PhyReg_LOW32_ADDR_ACCESS 6933
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6934
+PhyReg_FR32X 6935
+PhyReg_GR32 6936
+PhyReg_GR32_NOSP 6937
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6938
+PhyReg_DEBUG_REG 6939
+PhyReg_FR32 6940
+PhyReg_GR32_NOREX2 6941
+PhyReg_GR32_NOREX2_NOSP 6942
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6943
+PhyReg_GR32_NOREX 6944
+PhyReg_VK32 6945
+PhyReg_GR32_NOREX_NOSP 6946
+PhyReg_RFP32 6947
+PhyReg_VK32WM 6948
+PhyReg_GR32_ABCD 6949
+PhyReg_GR32_TC 6950
+PhyReg_GR32_ABCD_and_GR32_TC 6951
+PhyReg_GR32_AD 6952
+PhyReg_GR32_ArgRef 6953
+PhyReg_GR32_BPSP 6954
+PhyReg_GR32_BSI 6955
+PhyReg_GR32_CB 6956
+PhyReg_GR32_DC 6957
+PhyReg_GR32_DIBP 6958
+PhyReg_GR32_SIDI 6959
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6960
+PhyReg_CCR 6961
+PhyReg_DFCCR 6962
+PhyReg_GR32_ABCD_and_GR32_BSI 6963
+PhyReg_GR32_AD_and_GR32_ArgRef 6964
+PhyReg_GR32_ArgRef_and_GR32_CB 6965
+PhyReg_GR32_BPSP_and_GR32_DIBP 6966
+PhyReg_GR32_BPSP_and_GR32_TC 6967
+PhyReg_GR32_BSI_and_GR32_SIDI 6968
+PhyReg_GR32_DIBP_and_GR32_SIDI 6969
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6970
+PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6971
+PhyReg_RFP64 6972
+PhyReg_GR64 6973
+PhyReg_FR64X 6974
+PhyReg_GR64_with_sub_8bit 6975
+PhyReg_GR64_NOSP 6976
+PhyReg_GR64_NOREX2 6977
+PhyReg_CONTROL_REG 6978
+PhyReg_FR64 6979
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6980
+PhyReg_GR64_NOREX2_NOSP 6981
+PhyReg_GR64PLTSafe 6982
+PhyReg_GR64_TC 6983
+PhyReg_GR64_NOREX 6984
+PhyReg_GR64_TCW64 6985
+PhyReg_GR64_TC_with_sub_8bit 6986
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6987
+PhyReg_GR64_TCW64_with_sub_8bit 6988
+PhyReg_GR64_TC_and_GR64_TCW64 6989
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6990
+PhyReg_VK64 6991
+PhyReg_VR64 6992
+PhyReg_GR64PLTSafe_and_GR64_TC 6993
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6994
+PhyReg_GR64_NOREX_NOSP 6995
+PhyReg_GR64_NOREX_and_GR64_TC 6996
+PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6997
+PhyReg_VK64WM 6998
+PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6999
+PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7000
+PhyReg_GR64PLTSafe_and_GR64_TCW64 7001
+PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7002
+PhyReg_GR64_NOREX_and_GR64_TCW64 7003
+PhyReg_GR64_ABCD 7004
+PhyReg_GR64_with_sub_32bit_in_GR32_TC 7005
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7006
+PhyReg_GR64_AD 7007
+PhyReg_GR64_ArgRef 7008
+PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7009
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 7010
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 7011
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI 7012
+PhyReg_GR64_with_sub_32bit_in_GR32_CB 7013
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 7014
+PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 7015
+PhyReg_GR64_A 7016
+PhyReg_GR64_ArgRef_and_GR64_TC 7017
+PhyReg_GR64_and_LOW32_ADDR_ACCESS 7018
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7019
+PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7020
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7021
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7022
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7023
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7024
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7025
+PhyReg_RST 7026
+PhyReg_RFP80 7027
+PhyReg_RFP80_7 7028
+PhyReg_VR128X 7029
+PhyReg_VR128 7030
+PhyReg_VR256X 7031
+PhyReg_VR256 7032
+PhyReg_VR512 7033
+PhyReg_VR512_0_15 7034
+PhyReg_TILE 7035
+PhyReg_TILEPAIR 7036
+VirtReg_GR8 7037
+VirtReg_GRH8 7038
+VirtReg_GR8_NOREX2 7039
+VirtReg_GR8_NOREX 7040
+VirtReg_GR8_ABCD_H 7041
+VirtReg_GR8_ABCD_L 7042
+VirtReg_GRH16 7043
+VirtReg_GR16 7044
+VirtReg_GR16_NOREX2 7045
+VirtReg_GR16_NOREX 7046
+VirtReg_VK1 7047
+VirtReg_VK16 7048
+VirtReg_VK2 7049
+VirtReg_VK4 7050
+VirtReg_VK8 7051
+VirtReg_VK16WM 7052
+VirtReg_VK1WM 7053
+VirtReg_VK2WM 7054
+VirtReg_VK4WM 7055
+VirtReg_VK8WM 7056
+VirtReg_SEGMENT_REG 7057
+VirtReg_GR16_ABCD 7058
+VirtReg_FPCCR 7059
+VirtReg_FR16X 7060
+VirtReg_FR16 7061
+VirtReg_VK16PAIR 7062
+VirtReg_VK1PAIR 7063
+VirtReg_VK2PAIR 7064
+VirtReg_VK4PAIR 7065
+VirtReg_VK8PAIR 7066
+VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7067
+VirtReg_LOW32_ADDR_ACCESS_RBP 7068
+VirtReg_LOW32_ADDR_ACCESS 7069
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7070
+VirtReg_FR32X 7071
+VirtReg_GR32 7072
+VirtReg_GR32_NOSP 7073
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7074
+VirtReg_DEBUG_REG 7075
+VirtReg_FR32 7076
+VirtReg_GR32_NOREX2 7077
+VirtReg_GR32_NOREX2_NOSP 7078
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7079
+VirtReg_GR32_NOREX 7080
+VirtReg_VK32 7081
+VirtReg_GR32_NOREX_NOSP 7082
+VirtReg_RFP32 7083
+VirtReg_VK32WM 7084
+VirtReg_GR32_ABCD 7085
+VirtReg_GR32_TC 7086
+VirtReg_GR32_ABCD_and_GR32_TC 7087
+VirtReg_GR32_AD 7088
+VirtReg_GR32_ArgRef 7089
+VirtReg_GR32_BPSP 7090
+VirtReg_GR32_BSI 7091
+VirtReg_GR32_CB 7092
+VirtReg_GR32_DC 7093
+VirtReg_GR32_DIBP 7094
+VirtReg_GR32_SIDI 7095
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7096
+VirtReg_CCR 7097
+VirtReg_DFCCR 7098
+VirtReg_GR32_ABCD_and_GR32_BSI 7099
+VirtReg_GR32_AD_and_GR32_ArgRef 7100
+VirtReg_GR32_ArgRef_and_GR32_CB 7101
+VirtReg_GR32_BPSP_and_GR32_DIBP 7102
+VirtReg_GR32_BPSP_and_GR32_TC 7103
+VirtReg_GR32_BSI_and_GR32_SIDI 7104
+VirtReg_GR32_DIBP_and_GR32_SIDI 7105
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7106
+VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7107
+VirtReg_RFP64 7108
+VirtReg_GR64 7109
+VirtReg_FR64X 7110
+VirtReg_GR64_with_sub_8bit 7111
+VirtReg_GR64_NOSP 7112
+VirtReg_GR64_NOREX2 7113
+VirtReg_CONTROL_REG 7114
+VirtReg_FR64 7115
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7116
+VirtReg_GR64_NOREX2_NOSP 7117
+VirtReg_GR64PLTSafe 7118
+VirtReg_GR64_TC 7119
+VirtReg_GR64_NOREX 7120
+VirtReg_GR64_TCW64 7121
+VirtReg_GR64_TC_with_sub_8bit 7122
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7123
+VirtReg_GR64_TCW64_with_sub_8bit 7124
+VirtReg_GR64_TC_and_GR64_TCW64 7125
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7126
+VirtReg_VK64 7127
+VirtReg_VR64 7128
+VirtReg_GR64PLTSafe_and_GR64_TC 7129
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7130
+VirtReg_GR64_NOREX_NOSP 7131
+VirtReg_GR64_NOREX_and_GR64_TC 7132
+VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7133
+VirtReg_VK64WM 7134
+VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7135
+VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7136
+VirtReg_GR64PLTSafe_and_GR64_TCW64 7137
+VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7138
+VirtReg_GR64_NOREX_and_GR64_TCW64 7139
+VirtReg_GR64_ABCD 7140
+VirtReg_GR64_with_sub_32bit_in_GR32_TC 7141
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7142
+VirtReg_GR64_AD 7143
+VirtReg_GR64_ArgRef 7144
+VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7145
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7146
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7147
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7148
+VirtReg_GR64_with_sub_32bit_in_GR32_CB 7149
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7150
+VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7151
+VirtReg_GR64_A 7152
+VirtReg_GR64_ArgRef_and_GR64_TC 7153
+VirtReg_GR64_and_LOW32_ADDR_ACCESS 7154
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7155
+VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7156
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7157
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7158
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7159
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7160
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7161
+VirtReg_RST 7162
+VirtReg_RFP80 7163
+VirtReg_RFP80_7 7164
+VirtReg_VR128X 7165
+VirtReg_VR128 7166
+VirtReg_VR256X 7167
+VirtReg_VR256 7168
+VirtReg_VR512 7169
+VirtReg_VR512_0_15 7170
+VirtReg_TILE 7171
+VirtReg_TILEPAIR 7172
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.mir b/llvm/test/tools/llvm-ir2vec/triplets.mir
new file mode 100644
index 0000000..274984a
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.mir
@@ -0,0 +1,61 @@
+# REQUIRES: x86_64-linux
+# RUN: llvm-ir2vec triplets --mode=mir %s -o %t1.log 2>&1
+# RUN: diff %S/output/reference_triplets.txt %t1.log
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) {
+ entry:
+ %sum = add nsw i32 %a, %b
+ ret i32 %sum
+ }
+
+ define dso_local noundef i32 @mul_function(i32 noundef %x, i32 noundef %y) {
+ entry:
+ %product = mul nsw i32 %x, %y
+ ret i32 %product
+ }
+...
+---
+name: add_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags
+ $eax = COPY %2
+ RET 0, $eax
+
+---
+name: mul_function
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi
+
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %2:gr32 = nsw IMUL32rr %0, %1, implicit-def dead $eflags
+ $eax = COPY %2
+ RET 0, $eax
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 49af4df..c20409e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -6864,7 +6864,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
-# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1536.75 1281.75 794.25 748.25
+# CHECK-NEXT: - - - - - - - 245.00 651.00 651.00 570.50 272.50 83.75 83.75 81.75 81.75 1540.75 1285.75 790.25 744.25
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
@@ -9617,39 +9617,39 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z21.s - z24.s }, p5, [x10, #20, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - - 9.00 9.00 - - st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl]
# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - 4.50 4.50 4.50 4.50 9.00 9.00 - - st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0, x0]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z0.b }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0, x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z0.b }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z0.d }, p0, [z1.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z21.b }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1b { z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1b { z23.b }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1b { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1b { z31.s }, p7, [z31.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0, x0, lsl #3]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z0.d }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z0.d }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z21.d }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1d { z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1d { z23.d }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1d { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z0.h }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 0.25 0.25 0.25 0.25 0.50 0.50 - - stnt1h { z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z0.h }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z21.h }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1h { z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1h { z23.h }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1h { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s, x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1h { z31.s }, p7, [z31.s]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z0.d }, p0, [z1.d]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0, x0, lsl #2]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z0.s }, p0, [x0]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z0.s }, p0, [x0]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z0.s }, p0, [z1.s]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z21.s }, p5, [x10, #7, mul vl]
-# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.25 0.25 0.25 0.25 stnt1w { z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - - 0.50 0.50 - - stnt1w { z23.s }, p3, [x13, #-8, mul vl]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d, x0]
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - 1.00 1.00 - - stnt1w { z31.d }, p7, [z31.d]
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - 2.00 2.00 - - stnt1w { z31.s }, p7, [z31.s, x0]
diff --git a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
index aaaf6bf..9899dc5 100644
--- a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
+++ b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test
@@ -13,4 +13,35 @@
# RUN: dsymutil -f -oso-prepend-path=%p/../../dsymutil/ %t3 -o %t3.dSYM
# RUN: llvm-objdump --source --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s
+## Test that --source works with --macho flag.
+
+## --macho w/ explicit .dSYM
+# RUN: llvm-objdump < %p/../../dsymutil/Inputs/basic.macho.x86_64 - --source --macho --dsym=%t1.dSYM --prefix=%p/../../dsymutil | \
+# RUN: FileCheck --check-prefix=SOURCE %s
+
+## --macho w/ auto-detected .dSYM (dir)
+# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t2 | FileCheck --check-prefix=SOURCE %s
+
+## --macho w/ auto-detected .dSYM (file)
+# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s
+
# SOURCE: ; int bar(int arg) {
+
+## Test that --line-numbers works with --macho flag.
+
+## --macho -l w/ explicit .dSYM
+# RUN: llvm-objdump -d -l --macho --dsym=%t1.dSYM %p/../../dsymutil/Inputs/basic.macho.x86_64 | FileCheck --check-prefix=LINE %s
+
+## --macho -l w/ object file (embedded debug info)
+# RUN: llvm-objdump -d -l --macho %p/../../dsymutil/Inputs/basic1.macho.x86_64.o | FileCheck --check-prefix=LINE_OBJ %s
+
+# LINE: (__TEXT,__text) section
+# LINE: _bar:
+# LINE: ; bar():
+# LINE: ; {{.*}}basic3.c:
+
+# LINE_OBJ: (__TEXT,__text) section
+# LINE_OBJ: _main:
+# LINE_OBJ: ; main():
+# LINE_OBJ: ; {{.*}}basic1.c:23
+# LINE_OBJ: pushq %rbp ## basic1.c:23:0
diff --git a/llvm/test/tools/llvm-profdata/input-wildcard.test b/llvm/test/tools/llvm-profdata/input-wildcard.test
new file mode 100644
index 0000000..f2c46c9
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/input-wildcard.test
@@ -0,0 +1,25 @@
+# This test verifies that llvm-profdata will do wildcard expansion on its
+# arguments. The expansion is done by Windows-specific support in InitLLVM, so
+# we only expect this to work on Windows hosts.
+# REQUIRES: system-windows
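+# (Reference sketch, not part of the test: the expansion comes from
+# llvm::InitLLVM, which LLVM tools construct at the top of main; on
+# Windows it rewrites argv, expanding wildcard arguments before option
+# parsing runs.)
+#
+#   #include "llvm/Support/InitLLVM.h"
+#   int main(int argc, char **argv) {
+#     llvm::InitLLVM X(argc, argv); // expands '*' args on Windows hosts
+#     // ... option parsing then sees the expanded file list ...
+#   }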
+
+# Create two files to glob.
+RUN: echo '# empty profile 1' > %t.prof1.proftxt
+RUN: echo '# empty profile 2' > %t.prof2.proftxt
+
+# Prevent LIT itself from globbing by quoting the wildcard argument.
+RUN: llvm-profdata merge "%t.*.proftxt" -dump-input-file-list -o /dev/null | FileCheck %s
+
+# Verify that llvm-profdata expanded the wildcard argument.
+CHECK: 1,{{.*}}.prof1.proftxt
+CHECK-NEXT: 1,{{.*}}.prof2.proftxt
diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test
index 904892a..12a9d05 100644
--- a/llvm/test/tools/llvm-readobj/ELF/section-types.test
+++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test
@@ -63,6 +63,8 @@
# LLVM: Type: SHT_LLVM_PART_PHDR
# LLVM: Name: .llvm.lto
# LLVM: Type: SHT_LLVM_LTO
+# LLVM: Name: .llvm.callgraph
+# LLVM: Type: SHT_LLVM_CALL_GRAPH
# LLVM: Name: gnu_sframe
# LLVM: Type: SHT_GNU_SFRAME
# LLVM: Name: gnu_attributes
@@ -127,6 +129,7 @@
# GNU-NEXT: part1 LLVM_PART_EHDR
# GNU-NEXT: .phdrs LLVM_PART_PHDR
# GNU-NEXT: .llvm.lto LLVM_LTO
+# GNU-NEXT: .llvm.callgraph LLVM_CALL_GRAPH
# GNU-NEXT: gnu_sframe SFRAME
# GNU-NEXT: gnu_attributes ATTRIBUTES
# GNU-NEXT: gnu_hash GNU_HASH
@@ -218,6 +221,8 @@ Sections:
Type: SHT_LLVM_PART_PHDR
- Name: .llvm.lto
Type: SHT_LLVM_LTO
+ - Name: .llvm.callgraph
+ Type: SHT_LLVM_CALL_GRAPH
- Name: gnu_sframe
Type: SHT_GNU_SFRAME
- Name: gnu_attributes
diff --git a/llvm/test/tools/opt/no-target-machine.ll b/llvm/test/tools/opt/no-target-machine.ll
new file mode 100644
index 0000000..4f07c81
--- /dev/null
+++ b/llvm/test/tools/opt/no-target-machine.ll
@@ -0,0 +1,17 @@
+; Report error when pass requires TargetMachine.
+; RUN: not opt -passes=atomic-expand -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=codegenprepare -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=complex-deinterleaving -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=dwarf-eh-prepare -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=expand-large-div-rem -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=expand-memcmp -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=indirectbr-expand -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=interleaved-access -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=interleaved-load-combine -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=safe-stack -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=select-optimize -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=stack-protector -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes=typepromotion -disable-output %s 2>&1 | FileCheck %s
+; RUN: not opt -passes='expand-fp<O1>' -disable-output %s 2>&1 | FileCheck %s
+define void @foo() { ret void }
+; CHECK: pass '{{.+}}' requires TargetMachine