28 files changed, 3407 insertions, 494 deletions
diff --git a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
index e390d4b..303afc20 100644
--- a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
@@ -12,6 +12,6 @@ define ptr @undeclared_customalloc(i64 %size, i64 %align) {
   ret ptr %call
 }
 
-declare ptr @customalloc2(i64, i64) allockind("alloc") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
+declare ptr @customalloc2(i64, i64) allockind("alloc,uninitialized") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
 ; CHECK-DAG: declare ptr @customalloc2_zeroed(i64, i64) #[[CA2ATTR:[0-9]+]]
 ; CHECK-DAG: attributes #[[CA2ATTR]] = { allockind("alloc,zeroed") "alloc-family"="customalloc2" }
diff --git a/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll b/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll
new file mode 100644
index 0000000..5475880
--- /dev/null
+++ b/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;; Test if the callee_type metadata is dropped when it is
+;; is mapped to a direct function call from an indirect call during inlining.
+
+; RUN: opt -passes=inline -S < %s | FileCheck %s
+
+define i32 @_Z13call_indirectPFicEc(ptr %func, i8 %x) !type !0 {
+; CHECK-LABEL: define i32 @_Z13call_indirectPFicEc(
+; CHECK-SAME: ptr [[FUNC:%.*]], i8 [[X:%.*]]) !type [[META0:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 [[FUNC]](i8 [[X]]), !callee_type [[META1:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %call = call i32 %func(i8 %x), !callee_type !1
+  ret i32 %call
+}
+
+define i32 @_Z3barv() !type !3 {
+; CHECK-LABEL: define i32 @_Z3barv(
+; CHECK-SAME: ) !type [[META3:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i32 @_Z3fooc(i8 97)
+; CHECK-NEXT:    ret i32 [[CALL_I]]
+;
+entry:
+  %call = call i32 @_Z13call_indirectPFicEc(ptr nonnull @_Z3fooc, i8 97)
+  ret i32 %call
+}
+declare !type !2 i32 @_Z3fooc(i8 signext)
+
+!0 = !{i64 0, !"_ZTSFiPvcE.generalized"}
+!1 = !{!2}
+!2 = !{i64 0, !"_ZTSFicE.generalized"}
+!3 = !{i64 0, !"_ZTSFivE.generalized"}
+;.
+; CHECK: [[META0]] = !{i64 0, !"_ZTSFiPvcE.generalized"}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{i64 0, !"_ZTSFicE.generalized"}
+; CHECK: [[META3]] = !{i64 0, !"_ZTSFivE.generalized"}
+;.
diff --git a/llvm/test/Transforms/Inline/memprof_inline2.ll b/llvm/test/Transforms/Inline/memprof_inline2.ll
index 21448f1..d2e3927 100644
--- a/llvm/test/Transforms/Inline/memprof_inline2.ll
+++ b/llvm/test/Transforms/Inline/memprof_inline2.ll
@@ -38,6 +38,9 @@
 ;; }
 
 ; RUN: opt -passes=inline %s -S | FileCheck %s
+;; We should not perform additional discarding of non-cold contexts when
+;; rebuilding the tries after inlining, even with a very low threshold.
+; RUN: opt -passes=inline -memprof-callsite-cold-threshold=1 %s -S | FileCheck %s
 
 ; ModuleID = 'memprof_inline2.cc'
 source_filename = "memprof_inline2.cc"
diff --git a/llvm/test/Transforms/Inline/memprof_inline3.ll b/llvm/test/Transforms/Inline/memprof_inline3.ll
new file mode 100644
index 0000000..e802f2b
--- /dev/null
+++ b/llvm/test/Transforms/Inline/memprof_inline3.ll
@@ -0,0 +1,296 @@
+;; This test is the same code as memprof_inline2.ll, except that it has
+;; manually synthesized context size information. This test ensures that we
+;; don't attempt to apply -memprof-callsite-cold-threshold again when
+;; rebuilding the metadata after inlining.
+;
+; RUN: opt -passes=inline %s -S | FileCheck %s
+;; We should not perform additional discarding of non-cold contexts when
+;; rebuilding the tries after inlining, even with a very low threshold.
+; RUN: opt -passes=inline -memprof-callsite-cold-threshold=0 %s -S | FileCheck %s
+
+; ModuleID = 'memprof_inline2.cc'
+source_filename = "memprof_inline2.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress uwtable
+; CHECK-LABEL: define dso_local noundef ptr @_Z3foov
+define dso_local noundef ptr @_Z3foov() #0 !dbg !39 {
+entry:
+  ;; We should keep the original memprof metadata intact.
+  ; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[ORIGMEMPROF:[0-9]+]]
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !dbg !42, !memprof !43, !callsite !52
+  ret ptr %call, !dbg !53
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znam(i64 noundef) #1
+
+;; Mark noinline so we don't inline into calls from bar and baz. We should end
+;; up with a memprof metadata on the call to foo below.
+; Function Attrs: mustprogress noinline uwtable
+; CHECK-LABEL: define dso_local noundef ptr @_Z4foo2v
+define dso_local noundef ptr @_Z4foo2v() #2 !dbg !54 {
+entry:
+  ;; We should have memprof metadata for the call stacks from bar and baz,
+  ;; and the callsite metadata should be the concatentation of the id from the
+  ;; inlined call to new and the original callsite.
+  ; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[NEWMEMPROF:[0-9]+]], !callsite ![[NEWCALLSITE:[0-9]+]]
+  %call = call noundef ptr @_Z3foov(), !dbg !55, !callsite !56
+  ret ptr %call, !dbg !57
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local noundef ptr @_Z3barv() #0 !dbg !58 {
+entry:
+  %call = call noundef ptr @_Z4foo2v(), !dbg !59, !callsite !60
+  ret ptr %call, !dbg !61
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local noundef ptr @_Z3bazv() #0 !dbg !62 {
+entry:
+  %call = call noundef ptr @_Z4foo2v(), !dbg !63, !callsite !64
+  ret ptr %call, !dbg !65
+}
+
+;; Make sure we don't propagate any memprof/callsite metadata
+; Function Attrs: mustprogress uwtable
+; CHECK-LABEL: define dso_local noundef ptr @notprofiled
+define dso_local noundef ptr @notprofiled() #0 !dbg !66 {
+entry:
+  ;; When foo is inlined, both the memprof and callsite metadata should be
+  ;; stripped from the inlined call to new, as there is no callsite metadata on
+  ;; the call.
+  ; CHECK: call {{.*}} @_Znam
+  ; CHECK-NOT: !memprof
+  ; CHECK-NOT: !callsite
+  %call = call noundef ptr @_Z3foov(), !dbg !67
+  ;; When baz is inlined, the callsite metadata should be stripped from the
+  ;; inlined call to foo2, as there is no callsite metadata on the call.
+  ; CHECK: call {{.*}} @_Z4foo2v
+  ; CHECK-NOT: !callsite
+  %call2 = call noundef ptr @_Z3bazv()
+  ; CHECK-NEXT: ret
+  ret ptr %call, !dbg !68
+}
+
+; Function Attrs: mustprogress noinline norecurse optnone uwtable
+define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 !dbg !69 {
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca ptr, align 8
+  %c = alloca ptr, align 8
+  %d = alloca ptr, align 8
+  %e = alloca ptr, align 8
+  %f = alloca ptr, align 8
+  store i32 0, ptr %retval, align 4
+  store i32 %argc, ptr %argc.addr, align 4
+  store ptr %argv, ptr %argv.addr, align 8
+  ;; The below 4 callsites are all annotated as noinline
+  %call = call noundef ptr @_Z3foov() #8, !dbg !70, !callsite !71
+  store ptr %call, ptr %c, align 8, !dbg !72
+  %call1 = call noundef ptr @_Z3foov() #8, !dbg !73, !callsite !74
+  store ptr %call1, ptr %d, align 8, !dbg !75
+  %call2 = call noundef ptr @_Z3barv() #8, !dbg !76, !callsite !77
+  store ptr %call2, ptr %e, align 8, !dbg !78
+  %call3 = call noundef ptr @_Z3bazv() #8, !dbg !79, !callsite !80
+  store ptr %call3, ptr %f, align 8, !dbg !81
+  %0 = load ptr, ptr %c, align 8, !dbg !82
+  call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false), !dbg !83
+  %1 = load ptr, ptr %d, align 8, !dbg !84
+  call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false), !dbg !85
+  %2 = load ptr, ptr %e, align 8, !dbg !86
+  call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false), !dbg !87
+  %3 = load ptr, ptr %f, align 8, !dbg !88
+  call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false), !dbg !89
+  %4 = load ptr, ptr %c, align 8, !dbg !90
+  %isnull = icmp eq ptr %4, null, !dbg !91
+  br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !91
+
+delete.notnull:                                   ; preds = %entry
+  call void @_ZdaPv(ptr noundef %4) #9, !dbg !92
+  br label %delete.end, !dbg !92
+
+delete.end:                                       ; preds = %delete.notnull, %entry
+  %call4 = call i32 @sleep(i32 noundef 200), !dbg !94
+  %5 = load ptr, ptr %d, align 8, !dbg !95
+  %isnull5 = icmp eq ptr %5, null, !dbg !96
+  br i1 %isnull5, label %delete.end7, label %delete.notnull6, !dbg !96
+
+delete.notnull6:                                  ; preds = %delete.end
+  call void @_ZdaPv(ptr noundef %5) #9, !dbg !97
+  br label %delete.end7, !dbg !97
+
+delete.end7:                                      ; preds = %delete.notnull6, %delete.end
+  %6 = load ptr, ptr %e, align 8, !dbg !98
+  %isnull8 = icmp eq ptr %6, null, !dbg !99
+  br i1 %isnull8, label %delete.end10, label %delete.notnull9, !dbg !99
+
+delete.notnull9:                                  ; preds = %delete.end7
+  call void @_ZdaPv(ptr noundef %6) #9, !dbg !100
+  br label %delete.end10, !dbg !100
+
+delete.end10:                                     ; preds = %delete.notnull9, %delete.end7
+  %7 = load ptr, ptr %f, align 8, !dbg !101
+  %isnull11 = icmp eq ptr %7, null, !dbg !102
+  br i1 %isnull11, label %delete.end13, label %delete.notnull12, !dbg !102
+
+delete.notnull12:                                 ; preds = %delete.end10
+  call void @_ZdaPv(ptr noundef %7) #9, !dbg !103
+  br label %delete.end13, !dbg !103
+
+delete.end13:                                     ; preds = %delete.notnull12, %delete.end10
+  ret i32 0, !dbg !104
+}
+
+; Function Attrs: argmemonly nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdaPv(ptr noundef) #5
+
+declare i32 @sleep(i32 noundef) #6
+
+attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { argmemonly nofree nounwind willreturn writeonly }
+attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #7 = { builtin allocsize(0) }
+attributes #8 = { noinline }
+attributes #9 = { builtin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!llvm.ident = !{!38}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_inline.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "8711f6fd269e6cb5611fef48bc906eab")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!9 = !{i32 1, !"ProfileSummary", !10}
+!10 = !{!11, !12, !13, !14, !15, !16, !17, !18, !19, !20}
+!11 = !{!"ProfileFormat", !"InstrProf"}
+!12 = !{!"TotalCount", i64 0}
+!13 = !{!"MaxCount", i64 0}
+!14 = !{!"MaxInternalCount", i64 0}
+!15 = !{!"MaxFunctionCount", i64 0}
+!16 = !{!"NumCounts", i64 0}
+!17 = !{!"NumFunctions", i64 0}
+!18 = !{!"IsPartialProfile", i64 0}
+!19 = !{!"PartialProfileRatio", double 0.000000e+00}
+!20 = !{!"DetailedSummary", !21}
+!21 = !{!22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37}
+!22 = !{i32 10000, i64 0, i32 0}
+!23 = !{i32 100000, i64 0, i32 0}
+!24 = !{i32 200000, i64 0, i32 0}
+!25 = !{i32 300000, i64 0, i32 0}
+!26 = !{i32 400000, i64 0, i32 0}
+!27 = !{i32 500000, i64 0, i32 0}
+!28 = !{i32 600000, i64 0, i32 0}
+!29 = !{i32 700000, i64 0, i32 0}
+!30 = !{i32 800000, i64 0, i32 0}
+!31 = !{i32 900000, i64 0, i32 0}
+!32 = !{i32 950000, i64 0, i32 0}
+!33 = !{i32 990000, i64 0, i32 0}
+!34 = !{i32 999000, i64 0, i32 0}
+!35 = !{i32 999900, i64 0, i32 0}
+!36 = !{i32 999990, i64 0, i32 0}
+!37 = !{i32 999999, i64 0, i32 0}
+!38 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)"}
+!39 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!40 = !DISubroutineType(types: !41)
+!41 = !{}
+!42 = !DILocation(line: 5, column: 10, scope: !39)
+;; The first 2 are from the direct calls to foo from main. Those stay on the
+;; callsite in foo, which isn't inlined into main due to the callsites in main
+;; being annotated as noinline.
+;; The second 2 are from the calls from foo2, which inlines its callsite to foo
+;; but is not itself inlined into its callers. Therefore they get moved to a
+;; new memprof metadata within foo2.
+!43 = !{!44, !46, !48, !50}
+!44 = !{!45, !"cold", !105}
+!105 = !{i64 123, i64 5000}
+!45 = !{i64 -2458008693472584243, i64 7394638144382192936}
+!46 = !{!47, !"notcold", !106}
+!47 = !{i64 -2458008693472584243, i64 -8908997186479157179}
+!106 = !{i64 345, i64 1}
+!48 = !{!49, !"notcold", !107}
+!49 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872}
+!107 = !{i64 678, i64 1}
+!50 = !{!51, !"cold", !108}
+!51 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905}
+!108 = !{i64 234, i64 5000}
+; CHECK: ![[ORIGMEMPROF]] = !{![[ORIGMIB1:[0-9]+]], ![[ORIGMIB2:[0-9]+]], ![[ORIGMIB3:[0-9]+]], ![[ORIGMIB4:[0-9]+]]}
+; CHECK: ![[ORIGMIB1]] = !{![[ORIGMIBSTACK1:[0-9]+]], !"cold"
+; CHECK: ![[ORIGMIBSTACK1]] = !{i64 -2458008693472584243, i64 7394638144382192936}
+; CHECK: ![[ORIGMIB2]] = !{![[ORIGMIBSTACK2:[0-9]+]], !"notcold"
+; CHECK: ![[ORIGMIBSTACK2]] = !{i64 -2458008693472584243, i64 -8908997186479157179}
+; CHECK: ![[ORIGMIB3]] = !{![[ORIGMIBSTACK3:[0-9]+]], !"notcold"
+; CHECK: ![[ORIGMIBSTACK3]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872}
+; CHECK: ![[ORIGMIB4]] = !{![[ORIGMIBSTACK4:[0-9]+]], !"cold"
+; CHECK: ![[ORIGMIBSTACK4]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905}
+; CHECK: ![[NEWMEMPROF]] = !{![[ORIGMIB3:[0-9]+]], ![[ORIGMIB4:[0-9]+]]}
+; CHECK: ![[NEWCALLSITE]] = !{i64 -2458008693472584243, i64 -8079659623765193173}
+!52 = !{i64 -2458008693472584243}
+!53 = !DILocation(line: 5, column: 3, scope: !39)
+!54 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2v", scope: !1, file: !1, line: 7, type: !40, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!55 = !DILocation(line: 8, column: 10, scope: !54)
+!56 = !{i64 -8079659623765193173}
+!57 = !DILocation(line: 8, column: 3, scope: !54)
+!58 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 10, type: !40, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!59 = !DILocation(line: 11, column: 10, scope: !58)
+!60 = !{i64 -972865200055133905}
+!61 = !DILocation(line: 11, column: 3, scope: !58)
+!62 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 13, type: !40, scopeLine: 13, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!63 = !DILocation(line: 14, column: 10, scope: !62)
+!64 = !{i64 -4805294506621015872}
+!65 = !DILocation(line: 14, column: 3, scope: !62)
+!66 = distinct !DISubprogram(name: "notprofiled", linkageName: "notprofiled", scope: !1, file: !1, line: 400, type: !40, scopeLine: 400, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!67 = !DILocation(line: 401, column: 10, scope: !66)
+!68 = !DILocation(line: 401, column: 3, scope: !66)
+!69 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 16, type: !40, scopeLine: 16, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41)
+!70 = !DILocation(line: 17, column: 13, scope: !69)
+!71 = !{i64 -8908997186479157179}
+!72 = !DILocation(line: 17, column: 9, scope: !69)
+!73 = !DILocation(line: 18, column: 13, scope: !69)
+!74 = !{i64 7394638144382192936}
+!75 = !DILocation(line: 18, column: 9, scope: !69)
+!76 = !DILocation(line: 19, column: 13, scope: !69)
+!77 = !{i64 -5510257407004945023}
+!78 = !DILocation(line: 19, column: 9, scope: !69)
+!79 = !DILocation(line: 20, column: 13, scope: !69)
+!80 = !{i64 8771588133652501463}
+!81 = !DILocation(line: 20, column: 9, scope: !69)
+!82 = !DILocation(line: 21, column: 10, scope: !69)
+!83 = !DILocation(line: 21, column: 3, scope: !69)
+!84 = !DILocation(line: 22, column: 10, scope: !69)
+!85 = !DILocation(line: 22, column: 3, scope: !69)
+!86 = !DILocation(line: 23, column: 10, scope: !69)
+!87 = !DILocation(line: 23, column: 3, scope: !69)
+!88 = !DILocation(line: 24, column: 10, scope: !69)
+!89 = !DILocation(line: 24, column: 3, scope: !69)
+!90 = !DILocation(line: 25, column: 12, scope: !69)
+!91 = !DILocation(line: 25, column: 3, scope: !69)
+!92 = !DILocation(line: 25, column: 3, scope: !93)
+!93 = !DILexicalBlockFile(scope: !69, file: !1, discriminator: 2)
+!94 = !DILocation(line: 26, column: 3, scope: !69)
+!95 = !DILocation(line: 27, column: 12, scope: !69)
+!96 = !DILocation(line: 27, column: 3, scope: !69)
+!97 = !DILocation(line: 27, column: 3, scope: !93)
+!98 = !DILocation(line: 28, column: 12, scope: !69)
+!99 = !DILocation(line: 28, column: 3, scope: !69)
+!100 = !DILocation(line: 28, column: 3, scope: !93)
+!101 = !DILocation(line: 29, column: 12, scope: !69)
+!102 = !DILocation(line: 29, column: 3, scope: !69)
+!103 = !DILocation(line: 29, column: 3, scope: !93)
+!104 = !DILocation(line: 30, column: 3, scope: !69)
diff --git a/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll b/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll
new file mode 100644
index 0000000..83215f7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll
@@ -0,0 +1,25 @@
+;; Test if the callee_type metadata is dropped when it is attached
+;; to a direct function call during instcombine.
+
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i32 @_Z3barv() !type !0 {
+; CHECK-LABEL: define i32 @_Z3barv(
+; CHECK-SAME: ) !type [[META0:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @_Z3fooc(i8 97){{$}}
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %call = call i32 @_Z3fooc(i8 97), !callee_type !1
+  ret i32 %call
+}
+
+declare !type !2 i32 @_Z3fooc(i8 signext)
+
+!0 = !{i64 0, !"_ZTSFivE.generalized"}
+!1 = !{!2}
+!2 = !{i64 0, !"_ZTSFicE.generalized"}
+;.
+; CHECK: [[META0]] = !{i64 0, !"_ZTSFivE.generalized"}
+;.
diff --git a/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll b/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll
new file mode 100644
index 0000000..371f9b6
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll
@@ -0,0 +1,674 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+
+define i1 @fcmp_trunc(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x4058FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_ult(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ult(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ult double [[TMP0]], 0x4068FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ult float %trunc, 2.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_ole(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ole(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0x4072C00010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ole float %trunc, 3.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_ogt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ogt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x4079000010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, 4.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_zero(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_zero(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xB690000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 0.000000
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_with_nnan(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_with_nnan(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp nnan oge double [[TMP0]], 0x4058FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp nnan oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_with_ninf(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_with_ninf(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ninf oge double [[TMP0]], 0x4058FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ninf oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_with_nsz(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_with_nsz(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp nsz oge double [[TMP0]], 0x4058FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp nsz oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_with_reassoc(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_with_reassoc(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp reassoc oge double [[TMP0]], 0x4058FFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp reassoc oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_with_fast(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_with_fast(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast oge double [[TMP0]], 0x4058FFFFF0000000 
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp fast oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define <4 x i1> @fcmp_vec_trunc(<4 x double> %0) {
+; CHECK-LABEL: define <4 x i1> @fcmp_vec_trunc(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x double> [[TMP0]], splat (double 0x3FEFFFFFF0000000)
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %vec = fptrunc <4 x double> %0 to <4 x float>
+  %cmp = fcmp olt <4 x float> %vec, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x i1> %cmp
+}
+
+define <1 x i1> @fcmp_vec_trunc_scalar(<1 x double> %0) {
+; CHECK-LABEL: define <1 x i1> @fcmp_vec_trunc_scalar(
+; CHECK-SAME: <1 x double> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp fast olt <1 x double> [[TMP0]], splat (double 0x3FEFFFFFF0000000)
+; CHECK-NEXT:    ret <1 x i1> [[CMP]]
+;
+  %vec = fptrunc <1 x double> %0 to <1 x float>
+  %cmp = fcmp fast olt <1 x float> %vec, <float 1.0>
+  ret <1 x i1> %cmp
+}
+
+define i1 @fcmp_trunc_fp128(fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_fp128(
+; CHECK-SAME: fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast oge fp128 [[TMP0]], 0xL000000000000000040058FFFFF000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc fp128 %0 to float
+  %result = fcmp fast oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_x86_fp80(x86_fp80 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_x86_fp80(
+; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast oge x86_fp80 [[TMP0]], 0xK4005C7FFFF8000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc x86_fp80 %0 to float
+  %result = fcmp fast oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_ppc_fp128(ppc_fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ppc_fp128(
+; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast oge ppc_fp128 [[TMP0]], 0xM4058FFFFF00000000000000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc ppc_fp128 %0 to float
+  %result = fcmp fast oge float %trunc, 1.000000e+02
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_nan(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_nan(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    ret i1 false
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 0x7FF8000000000000
+  ret i1 %result
+}
+
+; denomalized 0x00000001
+define i1 @fcmp_trunc_d1(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d1(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x3690000000000001
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45
+  ret i1 %result
+}
+
+; denomalized 0x00000001 ole
+define i1 @fcmp_trunc_d1_ole(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d1_ole(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0x36A7FFFFFFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ole float %trunc, 1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45
+  ret i1 %result
+}
+
+; denomalized 0x00000002
+define i1 @fcmp_trunc_d2(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d2(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x36A8000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 2.8025969286496341418474591665798322625605238837530315435141365677795821653717212029732763767242431640625e-45
+  ret i1 %result
+}
+
+; denomalized 0x7fffff
+define i1 @fcmp_trunc_d3(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d3(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x380FFFFFDFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, 1.175494210692441075487029444849287348827052428745893333857174530571588870475618904265502351336181163787841796875e-38
+  ret i1 %result
+}
+
+; denomalized 0x80000001
+define i1 @fcmp_trunc_d4(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d4(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0xB690000000000001
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, -1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45
+  ret i1 %result
+}
+
+; denomalized 0x80000001
+define i1 @fcmp_trunc_d5(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_d5(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xB80FFFFFDFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp olt float %trunc, -1.175494210692441075487029444849287348827052428745893333857174530571588870475618904265502351336181163787841796875e-38
+  ret i1 %result
+}
+
+
+; +0
+define i1 @fcmp_trunc_p0(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_p0(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xB690000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 0x00000000
+  ret i1 %result
+}
+
+
+; -0
+define i1 @fcmp_trunc_n0(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_n0(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x3690000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, 0x8000000000000000
+  ret i1 %result
+}
+
+
+; max representable
+define i1 @fcmp_trunc_mx(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mx(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x47EFFFFFEFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, 0x47EFFFFFE0000000
+  ret i1 %result
+}
+
+; negative max representable
+define i1 @fcmp_trunc_mn(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mn(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xC7EFFFFFEFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp olt float %trunc, -3.4028234663852885981170418348451692544e38
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_literal_nan(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_literal_nan(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    ret i1 false
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 0x7FF8000000000000
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_literal_positive_inf(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_literal_positive_inf(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oeq float [[TRUNC]], 0x7FF0000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 0x7FF0000000000000
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_literal_negative_inf(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_literal_negative_inf(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp uno float [[TRUNC]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ult float %trunc, 0xFFF0000000000000
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_nan_ugt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_nan_ugt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ugt float %trunc, 0x7FF8000000000000
+  ret i1 %result
+}
+
+define i1 @fcmp_trunc_inf_uge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_inf_uge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ueq float [[TRUNC]], 0x7FF0000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp uge float %trunc, 0x7FF0000000000000
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_ninf_olt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ninf_olt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    ret i1 false
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp olt float %trunc, 0xFFF0000000000000
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_uge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_uge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp uge double [[TMP0]], 0x405EBFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp uge float %trunc, 123.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_uge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_uge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp uge double [[TMP0]], 0xC05EC00010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp uge float %trunc, -123.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_oge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_oge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x405EBFFFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, 123.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_oge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_oge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xC05EC00010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp oge float %trunc, -123.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_ugt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ugt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ugt double [[TMP0]], 0x40FE0F3010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ugt float %trunc, 123123.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_ugt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_ugt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ugt double [[TMP0]], 0xC0FE1B8FF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ugt float %trunc, -123321.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_ogt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_ogt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0xC0FE1B8FF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ogt float %trunc, -123321.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_ule(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_ule(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ule double [[TMP0]], 0x408ED80010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ule float %trunc, 987.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_neg_ule(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_ule(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ule double [[TMP0]], 0xC088A7FFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ule float %trunc, -789.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_neg_ole(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_ole(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0xC088A7FFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ole float %trunc, -789.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_ult(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_ult(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ult double [[TMP0]], 0xC088A80010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp ult float %trunc, -789.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_olt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_olt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0x408ED7FFF0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp olt float %trunc, 987.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_neg_olt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_olt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xC088A80010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp olt float %trunc, -789.0
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_nsz_uge(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_nsz_uge(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp nsz uge double [[TMP0]], 0xC05EC00010000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp nsz uge float %trunc, -123.0
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_reassoc_ugt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_reassoc_ugt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp reassoc ugt double [[TMP0]], 0x40889F8210000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp reassoc ugt float %trunc, 787.9384765625
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_reassoc_ugt(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_reassoc_ugt(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp reassoc ugt double [[TMP0]], 0xC0889F81F0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp reassoc ugt float %trunc, -787.9384765625
+  ret i1 %result
+}
+
+
+
+define i1 @fcmp_trunc_fast_ult(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_fast_ult(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast uge double [[TMP0]], 0x40F8E8E010000001
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp fast uge float %trunc, 102030.0078125
+  ret i1 %result
+}
+
+
+define i1 @fcmp_trunc_neg_fast_ult(double %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_neg_fast_ult(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp fast uge double [[TMP0]], 0xC0F8E8E02FFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc double %0 to float
+  %result = fcmp fast uge float %trunc, -102030.0078125
+  ret i1 %result
+}
+
+
+; max representable float to fp128
+define i1 @fcmp_trunc_mx_fp128(fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mx_fp128(
+; CHECK-SAME: fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ole fp128 [[TMP0]], 0xLFFFFFFFFFFFFFFFF407EFFFFFEFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc fp128 %0 to float
+  %result = fcmp ole float %trunc, 0x47EFFFFFE0000000
+  ret i1 %result
+}
+
+
+; max representable float to x86_fp80
+define i1 @fcmp_trunc_mx_x86_fp80(x86_fp80 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mx_x86_fp80(
+; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ule x86_fp80 [[TMP0]], 0xK407EFFFFFF7FFFFFFFFF
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc x86_fp80 %0 to float
+  %result = fcmp ule float %trunc, 0x47EFFFFFE0000000
+  ret i1 %result
+}
+
+
+; max representable float to ppc_fp128
+define i1 @fcmp_trunc_mx_ppc_fp128(ppc_fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mx_ppc_fp128(
+; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc ppc_fp128 [[TMP0]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp ole float [[TRUNC]], 0x47EFFFFFE0000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc ppc_fp128 %0 to float
+  %result = fcmp ole float %trunc, 0x47EFFFFFE0000000
+  ret i1 %result
+}
+
+
+; negative max representable float to fp128
+define i1 @fcmp_trunc_mn_fp128(fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mn_fp128(
+; CHECK-SAME: fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp olt fp128 [[TMP0]], 0xL0000000000000000C07EFFFFF1000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc fp128 %0 to float
+  %result = fcmp olt float %trunc, 0xC7EFFFFF00000000
+  ret i1 %result
+}
+
+
+; negative max representable float to x86_fp80
+define i1 @fcmp_trunc_mn_x86_fp80(x86_fp80 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mn_x86_fp80(
+; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp oge x86_fp80 [[TMP0]], 0xKC07EFFFFF88000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc x86_fp80 %0 to float
+  %result = fcmp oge float %trunc, 0xC7EFFFFF00000000
+  ret i1 %result
+}
+
+
+; negative max representable float to ppc_fp128
+define i1 @fcmp_trunc_mn_ppc_fp128(ppc_fp128 %0) {
+; CHECK-LABEL: define i1 @fcmp_trunc_mn_ppc_fp128(
+; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = fcmp uge ppc_fp128 [[TMP0]], 0xMC7EFFFFF100000000000000000000000
+; CHECK-NEXT:    ret i1 [[RESULT]]
+;
+  %trunc = fptrunc ppc_fp128 %0 to float
+  %result = fcmp uge float %trunc, 0xC7EFFFFF00000000
+  ret i1 %result
+}
+
diff --git a/llvm/test/Transforms/LoopInterchange/force-interchange.ll b/llvm/test/Transforms/LoopInterchange/force-interchange.ll
new file mode 100644
index 0000000..c33ecdf
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/force-interchange.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -passes=loop-interchange -pass-remarks-output=%t -disable-output -loop-interchange-profitabilities=ignore -S
+; RUN: FileCheck --input-file=%t %s
+
+; There should be no reason to interchange this, unless it is forced.
+;
+;     for (int i = 0; i<1024; i++)
+;       for (int j = 0; j<1024; j++)
+;         A[i][j] = 42;
+;
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        f
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT: ...
+
+@A = dso_local local_unnamed_addr global [1024 x [1024 x i32]] zeroinitializer, align 4
+
+define dso_local void @f() local_unnamed_addr #0 {
+entry:
+  br label %outer.header
+
+outer.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %inner.header ]
+  br label %inner.body
+
+inner.header:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond20.not = icmp eq i64 %i.next, 1024
+  br i1 %exitcond20.not, label %exit, label %outer.header
+
+inner.body:
+  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+  %arrayidx6 = getelementptr inbounds nuw [1024 x [1024 x i32]], ptr @A, i64 0, i64 %i, i64 %j
+  store i32 42, ptr %arrayidx6, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, 1024
+  br i1 %exitcond.not, label %inner.header, label %inner.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/fp-reductions.ll b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll
new file mode 100644
index 0000000..0703a7b
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll
@@ -0,0 +1,437 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-output=%t -disable-output \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa
+; RUN: FileCheck -input-file=%t %s
+
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point addition.
+;
+; float sum = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     sum += A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fadd
+define void @reduction_fadd(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %sum.j.next = fadd float %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the interchange is legal if the floating-point addition is marked
+; as reassoc.
+;
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fadd
+define void @reduction_reassoc_fadd(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %sum.j.next = fadd reassoc float %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; FIXME: Is it really legal to interchange the loops when
+; both reassoc and ninf are set?
+; Check that the interchange is legal if the floating-point addition is marked
+; as reassoc.
+;
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_ninf_fadd
+define void @reduction_reassoc_ninf_fadd(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %sum.j.next = fadd reassoc ninf float %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point multiplication.
+;
+; float prod = 1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     prod *= A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fmul
+define void @reduction_fmul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %prod.j.next = fmul float %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the interchange is legal if the floating-point multiplication is
+; marked as reassoc.
+;
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fmul
+define void @reduction_reassoc_fmul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %prod.j.next = fmul reassoc float %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point fmuladd.
+;
+; float fmuladd = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmuladd += A[j][i] * B[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fmuladd
+define void @reduction_fmuladd(ptr %A, ptr %B) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
+  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx.a, align 4
+  %b = load float, ptr %idx.b, align 4
+  %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the interchange is legal if the floating-point fmuladd is marked
+; as reassoc.
+;
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fmuladd
+define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
+  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx.a, align 4
+  %b = load float, ptr %idx.b, align 4
+  %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that interchanging the loops is legal for the reassociative
+; floating-point minimum.
+;
+; float fmin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmin = (A[j][i] < fmin) ? A[j][i] : fmin;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmin
+define void @reduction_fmin(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %cmp = fcmp nnan nsz olt float %a, %fmin.j
+  %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that interchanging the loops is legal for the floating-point
+; llvm.minimumnum.
+;
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmininumnum
+define void @reduction_fmininumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that interchanging the loops is legal for the reassociative
+; floating-point maximum.
+;
+; float fmax = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmax = (A[j][i] > fmax) ? A[j][i] : fmax;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmax
+define void @reduction_fmax(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %cmp = fcmp nnan nsz ogt float %a, %fmax.j
+  %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that interchanging the loops is legal for the floating-point
+; llvm.maximumnum.
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmaxinumnum
+define void @reduction_fmaxinumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
+declare float @llvm.minimumnum.f32(float %a, float %b)
+declare float @llvm.maximumnum.f32(float %a, float %b)
+\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
index 0eb6fe9..f5c6ad7 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
@@ -333,437 +333,3 @@ for.i.latch:
 exit:
   ret void
 }
-
-; Check that the loops aren't exchanged if there is a reduction of
-; non-reassociative floating-point addition.
-;
-; float sum = 0;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     sum += A[j][i];
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_fadd
-define void @reduction_fadd(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %sum.j.next = fadd float %sum.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that the interchange is legal if the floating-point addition is marked
-; as reassoc.
-;
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_reassoc_fadd
-define void @reduction_reassoc_fadd(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %sum.j.next = fadd reassoc float %sum.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; FIXME: Is it really legal to interchange the loops when
-; both reassoc and ninf are set?
-; Check that the interchange is legal if the floating-point addition is marked
-; as reassoc.
-;
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_reassoc_ninf_fadd
-define void @reduction_reassoc_ninf_fadd(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %sum.j.next = fadd reassoc ninf float %sum.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that the loops aren't exchanged if there is a reduction of
-; non-reassociative floating-point multiplication.
-;
-; float prod = 1;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     prod *= A[j][i];
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_fmul
-define void @reduction_fmul(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %prod.j.next = fmul float %prod.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that the interchange is legal if the floating-point multiplication is
-; marked as reassoc.
-;
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_reassoc_fmul
-define void @reduction_reassoc_fmul(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %prod.j.next = fmul reassoc float %prod.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that the loops aren't exchanged if there is a reduction of
-; non-reassociative floating-point fmuladd.
-;
-; float fmuladd = 0;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     fmuladd += A[j][i] * B[j][i];
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_fmuladd
-define void @reduction_fmuladd(ptr %A, ptr %B) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
-  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx.a, align 4
-  %b = load float, ptr %idx.b, align 4
-  %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that the interchange is legal if the floating-point fmuladd is marked
-; as reassoc.
-;
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_reassoc_fmuladd
-define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
-  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx.a, align 4
-  %b = load float, ptr %idx.b, align 4
-  %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that interchanging the loops is legal for the reassociative
-; floating-point minimum.
-;
-; float fmin = init;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     fmin = (A[j][i] < fmin) ? A[j][i] : fmin;
-
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_fmin
-define void @reduction_fmin(ptr %A, float %init) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %cmp = fcmp nnan nsz olt float %a, %fmin.j
-  %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-
-; Check that interchanging the loops is legal for the floating-point
-; llvm.minimumnum.
-;
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_fmininumnum
-define void @reduction_fmininumnum(ptr %A, float %init) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j)
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that interchanging the loops is legal for the reassociative
-; floating-point maximum.
-;
-; float fmax = init;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     fmax = (A[j][i] > fmax) ? A[j][i] : fmax;
-
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_fmax
-define void @reduction_fmax(ptr %A, float %init) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %cmp = fcmp nnan nsz ogt float %a, %fmax.j
-  %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; Check that interchanging the loops is legal for the floating-point
-; llvm.maximumnum.
-
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_fmaxinumnum
-define void @reduction_fmaxinumnum(ptr %A, float %init) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load float, ptr %idx, align 4
-  %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j)
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
-declare float @llvm.minimumnum.f32(float %a, float %b)
-declare float @llvm.maximumnum.f32(float %a, float %b)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll
new file mode 100644
index 0000000..c7a0de22
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-reduce -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %p, i64 %idx) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x [4 x i32]], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[ALLOCA]])
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[IDX]], 6
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 48
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr nuw i8, ptr [[ALLOCA]], i64 48
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ -8, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[LSR_IV]], 2
+; CHECK-NEXT:    [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT:    [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[SCEVGEP8]], i64 32
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SCEVGEP9]], align 4
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[LSR_IV]]
+; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[SCEVGEP6]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[SCEVGEP7]], align 4
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SCEVGEP3]], i64 [[LSR_IV]]
+; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[SCEVGEP5]], align 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[SCEVGEP2]], align 4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[ALLOCA]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [4 x [4 x i32]], align 16
+  call void @llvm.lifetime.start.p0(i64 64, ptr %alloca)
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
+  %gep1 = getelementptr [4 x [12 x [4 x [4 x i32]]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv, i64 0
+  %0 = load i32, ptr %gep1, align 4
+  %gep2 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv
+  %1 = load i32, ptr %gep2, align 4
+  %gep3 = getelementptr [4 x [4 x i32]], ptr %alloca, i64 0, i64 3, i64 %indvars.iv
+  %2 = load i32, ptr %gep3, align 4
+  %gep4 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 %idx, i64 3, i64 %indvars.iv
+  %3 = load i32, ptr %gep4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv, 1
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  call void @llvm.lifetime.end.p0(i64 64, ptr %alloca)
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index 1a091e8..0b78bee 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -578,8 +578,323 @@ loop.latch:
 exit:
   ret void
 }
+
+define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) {
+; APPLE-LABEL: define i32 @test_add_reduction_unroll_partial(
+; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; APPLE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; APPLE-NEXT:    [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; APPLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
+; APPLE-NEXT:    ret i32 [[BIN_RDX2]]
+;
+; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial(
+; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; OTHER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; OTHER-NEXT:    [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; OTHER-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
+; OTHER-NEXT:    [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
+; OTHER-NEXT:    [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]]
+; OTHER-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; OTHER-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
+; OTHER-NEXT:    [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP2]]
+; OTHER-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; OTHER-NEXT:    [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
+; OTHER-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP3]]
+; OTHER-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; OTHER-NEXT:    [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024
+; OTHER-NEXT:    br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; OTHER-NEXT:    ret i32 [[BIN_RDX2]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1024
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop ]
+  ret i32 %res
+}
+
+declare i1 @cond()
+
+define i32 @test_add_reduction_multi_block(ptr %a, i64 noundef %n) {
+; APPLE-LABEL: define i32 @test_add_reduction_multi_block(
+; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; APPLE-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; APPLE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; APPLE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; APPLE-NEXT:    [[C:%.*]] = call i1 @cond()
+; APPLE-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; APPLE:       [[THEN]]:
+; APPLE-NEXT:    store i32 0, ptr [[GEP_A]], align 4
+; APPLE-NEXT:    br label %[[LOOP_LATCH]]
+; APPLE:       [[LOOP_LATCH]]:
+; APPLE-NEXT:    [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; APPLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ]
+; APPLE-NEXT:    ret i32 [[RES]]
+;
+; OTHER-LABEL: define i32 @test_add_reduction_multi_block(
+; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; OTHER-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; OTHER-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; OTHER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; OTHER-NEXT:    [[C:%.*]] = call i1 @cond()
+; OTHER-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; OTHER:       [[THEN]]:
+; OTHER-NEXT:    store i32 0, ptr [[GEP_A]], align 4
+; OTHER-NEXT:    br label %[[LOOP_LATCH]]
+; OTHER:       [[LOOP_LATCH]]:
+; OTHER-NEXT:    [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; OTHER-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; OTHER-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ]
+; OTHER-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop.latch ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %c = call i1 @cond()
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  store i32 0, ptr %gep.a
+  br label %loop.latch
+
+loop.latch:
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1024
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop.latch ]
+  ret i32 %res
+}
+
+define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) {
+; APPLE-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial(
+; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; APPLE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; APPLE-NEXT:    [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; APPLE-NEXT:    [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]]
+; APPLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
+; APPLE-NEXT:    [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ]
+; APPLE-NEXT:    [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]]
+; APPLE-NEXT:    ret i32 [[SUM]]
+;
+; OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial(
+; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_1:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; OTHER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
+; OTHER-NEXT:    [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]]
+; OTHER-NEXT:    [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]]
+; OTHER-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
+; OTHER-NEXT:    [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]]
+; OTHER-NEXT:    [[RDX_2_NEXT_1]] = mul i32 [[RDX_2_NEXT]], [[TMP1]]
+; OTHER-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; OTHER-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT_1]], 1024
+; OTHER-NEXT:    br i1 [[EC_1]], label %[[EXIT:.*]], label %[[LOOP]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    [[BIN_RDX:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT_1]], %[[LOOP]] ]
+; OTHER-NEXT:    [[SUM:%.*]] = add i32 [[BIN_RDX]], [[RES_2]]
+; OTHER-NEXT:    ret i32 [[SUM]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %rdx.2 = phi i32 [ 0, %entry ], [ %rdx.2.next, %loop ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %rdx.2.next = mul i32 %rdx.2, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1024
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %res.1 = phi i32 [ %rdx.next, %loop ]
+  %res.2 = phi i32 [ %rdx.2.next, %loop ]
+  %sum = add i32 %res.1, %res.2
+  ret i32 %sum
+}
+
+
+define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) {
+; APPLE-LABEL: define i32 @test_add_reduction_runtime(
+; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]]
+; APPLE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
+; APPLE-NEXT:    [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]]
+; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
+; APPLE-NEXT:    br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ]
+; APPLE-NEXT:    ret i32 [[RES]]
+;
+; OTHER-LABEL: define i32 @test_add_reduction_runtime(
+; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; OTHER-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 3
+; OTHER-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
+; OTHER-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; OTHER:       [[ENTRY_NEW]]:
+; OTHER-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; OTHER-NEXT:    [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2
+; OTHER-NEXT:    [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]]
+; OTHER-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
+; OTHER-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2
+; OTHER-NEXT:    [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]]
+; OTHER-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; OTHER-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
+; OTHER-NEXT:    [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]]
+; OTHER-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; OTHER-NEXT:    [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
+; OTHER-NEXT:    [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]]
+; OTHER-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; OTHER-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
+; OTHER-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
+; OTHER-NEXT:    br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; OTHER:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; OTHER-NEXT:    [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; OTHER-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ]
+; OTHER-NEXT:    [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; OTHER-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; OTHER:       [[EXIT_UNR_LCSSA]]:
+; OTHER-NEXT:    [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; OTHER-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; OTHER-NEXT:    [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; OTHER-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; OTHER-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; OTHER:       [[LOOP_EPIL_PREHEADER]]:
+; OTHER-NEXT:    br label %[[LOOP_EPIL:.*]]
+; OTHER:       [[LOOP_EPIL]]:
+; OTHER-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
+; OTHER-NEXT:    [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
+; OTHER-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; OTHER-NEXT:    [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]]
+; OTHER-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
+; OTHER-NEXT:    [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]]
+; OTHER-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; OTHER-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
+; OTHER-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; OTHER-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; OTHER-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; OTHER:       [[EXIT_EPILOG_LCSSA]]:
+; OTHER-NEXT:    [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; OTHER-NEXT:    br label %[[EXIT]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ]
+; OTHER-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop ]
+  ret i32 %res
+}
 ;.
 ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
 ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
 ; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
 ;.
+; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
new file mode 100644
index 0000000..953dc278
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
@@ -0,0 +1,446 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
+
+define i32 @test_add(ptr %src, i64 %n, i32 %start) {
+; CHECK-LABEL: define i32 @test_add(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.next = add i32 %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx.next
+}
+
+define i32 @test_add_tc_not_multiple_of_4(ptr %src, i64 %n, i32 %start) {
+; CHECK-LABEL: define i32 @test_add_tc_not_multiple_of_4(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_1:.*]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1001
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP_1]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_1]]:
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i32, ptr [[GEP_SRC_12]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_12]]
+; CHECK-NEXT:    br label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.next = add i32 %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1001
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx.next
+}
+
+define i32 @test_add_rdx_used_in_loop(ptr %src, i64 %n, i32 %start) {
+; CHECK-LABEL: define i32 @test_add_rdx_used_in_loop(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
+; CHECK-NEXT:    store i32 [[RDX_NEXT]], ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    store i32 [[RDX_NEXT_1]], ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    store i32 [[RDX_NEXT_2]], ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_24]] = add i32 [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    store i32 [[RDX_NEXT_24]], ptr [[GEP_SRC_24]], align 4
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.next = add i32 %rdx, %l
+  store i32 %rdx.next, ptr %gep.src
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx.next
+}
+
+define i32 @test_add_phi_used_outside_loop(ptr %src, i64 %n, i32 %start) {
+; CHECK-LABEL: define i32 @test_add_phi_used_outside_loop(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_2]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RDX_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.next = add i32 %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx
+}
+
+define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) {
+; CHECK-LABEL: define i32 @test_add_and_mul_reduction(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]]
+; CHECK-NEXT:    [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]]
+; CHECK-NEXT:    [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_2]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx.1 = phi i32 [ %start, %entry ], [ %rdx.1.next, %loop ]
+  %rdx.2 = phi i32 [ %start, %entry ], [ %rdx.2.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.1.next = add i32 %rdx.1, %l
+  %rdx.2.next = mul i32 %rdx.2, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  %res = add i32 %rdx.1.next, %rdx.2.next
+  ret i32 %res
+}
+
+define float @test_fadd_no_fmfs(ptr %src, i64 %n, float %start) {
+; CHECK-LABEL: define float @test_fadd_no_fmfs(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 1
+  %rdx.next = fadd float %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret float %rdx.next
+}
+
+define float @test_fadd_with_ressaoc(ptr %src, i64 %n, float %start) {
+; CHECK-LABEL: define float @test_fadd_with_ressaoc(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 1
+  %rdx.next = fadd float %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret float %rdx.next
+}
+define i32 @test_smin(ptr %src, i64 %n) {
+; CHECK-LABEL: define i32 @test_smin(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MIN:%.*]] = phi i32 [ 1000, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = call i32 @llvm.smin.i32(i32 [[MIN]], i32 [[L]])
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT]], i32 [[L_1]])
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_1]], i32 [[L_2]])
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[RDX_NEXT_3]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_2]], i32 [[L_24]])
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %min = phi i32 [ 1000, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32 , ptr %gep.src, align 1
+  %rdx.next = call i32 @llvm.smin(i32 %min, i32 %l)
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx.next
+}
+
+define i64 @test_any_of_reduction(ptr %src, i64 %n) {
+; CHECK-LABEL: define i64 @test_any_of_reduction(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ANY_OF_RDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L]], 0
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = select i1 [[C]], i64 [[ANY_OF_RDX]], i64 0
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L_1]], 0
+; CHECK-NEXT:    [[RDX_NEXT_1:%.*]] = select i1 [[C_1]], i64 [[RDX_NEXT]], i64 0
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i8 [[L_2]], 0
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = select i1 [[C_2]], i64 [[RDX_NEXT_1]], i64 0
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_SRC_24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[L_24:%.*]] = load i8, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT:    [[C_24:%.*]] = icmp eq i8 [[L_24]], 0
+; CHECK-NEXT:    [[RDX_NEXT_3]] = select i1 [[C_24]], i64 [[RDX_NEXT_2]], i64 0
+; CHECK-NEXT:    [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT:    br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i64 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %any.of.rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i8, ptr %src, i64 %iv
+  %l = load i8, ptr %gep.src, align 1
+  %c = icmp eq i8 %l, 0
+  %rdx.next = select i1 %c, i64 %any.of.rdx, i64 0
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i64 %rdx.next
+}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll
new file mode 100644
index 0000000..89f06ad
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -S %s | FileCheck %s
+
+define i32 @test_add_reduction(ptr %a, i64 %n) {
+; CHECK-LABEL: define i32 @test_add_reduction(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK:       [[ENTRY_NEW]]:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]]
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CHECK-NEXT:    [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; CHECK:       [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT:    [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP_EPIL:.*]]
+; CHECK:       [[LOOP_EPIL]]:
+; CHECK-NEXT:    [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], [[TMP4]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop ]
+  ret i32 %res
+}
+
+define i32 @test_add_reduction_constant_op(ptr %a, i64 %n) {
+; CHECK-LABEL: define i32 @test_add_reduction_constant_op(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK:       [[ENTRY_NEW]]:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX]], 2
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CHECK-NEXT:    [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; CHECK:       [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT:    [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP_EPIL:.*]]
+; CHECK:       [[LOOP_EPIL]]:
+; CHECK-NEXT:    [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], 1
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %rdx.next = add nuw nsw i32 %rdx, 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop ]
+  ret i32 %res
+}
+
+define i32 @test_add_reduction_8x_unroll(ptr %a, i64 %n) {
+; CHECK-LABEL: define i32 @test_add_reduction_8x_unroll(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CHECK:       [[ENTRY_NEW]]:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_7:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2
+; CHECK-NEXT:    [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2
+; CHECK-NEXT:    [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]]
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]]
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2
+; CHECK-NEXT:    [[RDX_4:%.*]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]]
+; CHECK-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT:    [[GEP_A_4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP_A_4]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_4:%.*]] = add nuw nsw i32 [[RDX_4]], [[TMP6]]
+; CHECK-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; CHECK-NEXT:    [[GEP_A_5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[GEP_A_5]], align 2
+; CHECK-NEXT:    [[RDX_6:%.*]] = add nuw nsw i32 [[RDX_NEXT_4]], [[TMP7]]
+; CHECK-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; CHECK-NEXT:    [[GEP_A_6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[GEP_A_6]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_6:%.*]] = add nuw nsw i32 [[RDX_6]], [[TMP8]]
+; CHECK-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; CHECK-NEXT:    [[GEP_A_7:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[GEP_A_7]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_7]] = add nuw nsw i32 [[RDX_NEXT_6]], [[TMP9]]
+; CHECK-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
+; CHECK-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; CHECK-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CHECK-NEXT:    [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; CHECK:       [[EXIT_UNR_LCSSA]]:
+; CHECK-NEXT:    [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_EPIL_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP_EPIL:.*]]
+; CHECK:       [[LOOP_EPIL]]:
+; CHECK-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
+; CHECK-NEXT:    [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP10]]
+; CHECK-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; CHECK-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
+; CHECK-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; CHECK-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[EXIT_EPILOG_LCSSA]]:
+; CHECK-NEXT:    [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+  %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %1 = load i32, ptr %gep.a, align 2
+  %rdx.next = add nuw nsw i32 %rdx, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop, !llvm.loop !2
+
+exit:
+  %res = phi i32 [ %rdx.next, %loop ]
+  ret i32 %res
+}
+
+
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.count", i32 2}
+
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.unroll.count", i32 8}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 451574a..427a05c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index e93ee55..1a8e594 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fminnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
new file mode 100644
index 0000000..2d15431
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+;
+; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
+; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
+; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %sub = sub i32 0, %mul
+  %add = add i32 %accum, %sub
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 173766cc..ccfa725 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -386,8 +386,7 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX24]]
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr double, ptr [[TMP45]], i32 0
-; CHECK-NEXT:    [[TMP47:%.*]] = load double, ptr [[TMP46]], align 8
+; CHECK-NEXT:    [[TMP47:%.*]] = load double, ptr [[TMP45]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP47]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP48:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 5.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 813d61b..38e224f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -166,8 +166,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
 ; VF2:       [[VECTOR_BODY]]:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP1]], align 8
 ; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
@@ -959,13 +958,11 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
 ; VF2:       [[VECTOR_BODY]]:
 ; VF2-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; VF2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; VF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT:    [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index f226ae9..cb7f0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -18,8 +18,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index b2e080f..a2eddad 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index 5661406..1ca5586 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fmaxnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,234 @@ define float @fmaxnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_10(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -10
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 10, [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 10, [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 10, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_value(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[START:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[START]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[START]], [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[START]], [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_with_additional_add(ptr noalias %src, ptr noalias %src.2, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_with_additional_add(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC_2]], i64 [[IV]]
+; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT:    [[SUM_NEXT]] = add i32 [[SUM]], [[L_SRC_2]]
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    store i32 [[SUM_NEXT_LCSSA]], ptr [[SRC_2]], align 4
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -247,14 +496,19 @@ entry:
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
+  %gep.src.2 = getelementptr inbounds nuw i32, ptr %src.2, i64 %iv
+  %l.src.2 = load i32, ptr %gep.src.2, align 4
+  %sum.next = add i32 %sum, %l.src.2
   %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
   %l = load float, ptr %gep.src, align 4
-  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
   %iv.next = add nuw nsw i64 %iv, 1
   %ec = icmp eq i64 %iv.next, %n
   br i1 %ec, label %exit, label %loop
 
 exit:
+  store i32 %sum.next, ptr %src.2
   ret float %max.next
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
index 148beb6..68bc8d0 100644
--- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fminnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,51 @@ define float @fminnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
index 85a90f2..e7ab02c 100644
--- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -1001,8 +1001,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmin_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.minnum.v2f32
+; CHECK: <2 x float> @llvm.minnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmin_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
@@ -1021,8 +1023,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmax_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.maxnum.v2f32
+; CHECK: <2 x float> @llvm.maxnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmax_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index f7e629f..d4e3238 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s -slp-threshold=-3 | FileCheck %s --check-prefix=THRESH
 
 %struct.ImageParameters = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], ptr, ptr, ptr, ptr, ptr, [1200 x %struct.syntaxelement], ptr, ptr, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, ptr, ptr, ptr, ptr, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [15 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], i32, i32, i32 }
 %struct.syntaxelement = type { i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr }
@@ -94,6 +95,89 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; CHECK-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
 ; CHECK-NEXT:    ret i32 0
 ;
+; THRESH-LABEL: define fastcc i32 @test(
+; THRESH-SAME: i32 [[TMP0:%.*]], i32 [[ADD111_I_I:%.*]], <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; THRESH-NEXT:  [[ENTRY:.*:]]
+; THRESH-NEXT:    [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
+; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
+; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
+; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
+; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
+; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
+; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
+; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
+; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
+; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
+; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
+; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
+; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
+; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
+; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
+; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
+; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
+; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
+; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
+; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
+; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
+; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
+; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
+; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
+; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
+; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
+; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
+; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
+; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
+; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
+; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
+; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
+; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
+; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
+; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
+; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
+; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
+; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
+; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
+; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    ret i32 0
+;
 entry:
   %LoopArray.sroa.24.0.i.i3 = ashr i32 %0, 1
   %shr143.5.i.i9 = ashr i32 %0, 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
index 8e09847..cfff117 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
@@ -58,7 +58,6 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
 ; CHECK-NEXT:    br label %[[BB54:.*]]
 ; CHECK:       [[BB54]]:
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 0, ptr null)
 ; CHECK-NEXT:    [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]])
 ; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <8 x float> [[TMP56]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison>
@@ -198,7 +197,6 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
   %95 = or i64 %94, %91
   %96 = or i64 %95, %37
   store i64 %96, ptr null, align 1
-  call void @llvm.lifetime.start.p0(i64 0, ptr null)
   store i64 %42, ptr null, align 1
   %97 = bitcast float %3 to i32
   %98 = icmp ult i32 %97, 1325400064
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll b/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll
new file mode 100644
index 0000000..3e56939
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+;; Test if the callee_type metadata is merged correctly.
+
+; RUN: opt -passes=simplifycfg -S < %s | FileCheck %s
+
+;; Test if the callee_type metadata is merged correctly when
+;; the instructions carry differring callee_type metadata.
+define ptr @_Z10test_diffb(i1 zeroext %b) {
+; CHECK-LABEL: define ptr @_Z10test_diffb(
+; CHECK-SAME: i1 zeroext [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[FN:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr @_Znwm, ptr [[FN]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META0:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  %fn = alloca ptr
+  store ptr @_Znwm, ptr %fn
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call ptr %fn(i64 4), !callee_type !4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call ptr %fn(i64 4), !callee_type !3
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+;; Test if the callee_type metadata is merged correctly when
+;; the instructions carry same callee_type metadata.
+define ptr @_Z10test_sameb(i1 zeroext %b) {
+; CHECK-LABEL: define ptr @_Z10test_sameb(
+; CHECK-SAME: i1 zeroext [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[FN:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr @_Znwm, ptr [[FN]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META3:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  %fn = alloca ptr
+  store ptr @_Znwm, ptr %fn
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call ptr %fn(i64 4), !callee_type !3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call ptr %fn(i64 4), !callee_type !3
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+;; Test if the callee_type metadata is dropped correctly when
+;; only the left instruction has callee_type metadata.
+define ptr @_Z10test_leftb(i1 zeroext %b) {
+; CHECK-LABEL: define ptr @_Z10test_leftb(
+; CHECK-SAME: i1 zeroext [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[FN:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr @_Znwm, ptr [[FN]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr [[FN]](i64 4)
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  %fn = alloca ptr
+  store ptr @_Znwm, ptr %fn
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call ptr %fn(i64 4), !callee_type !4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call ptr %fn(i64 4)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+;; Test if the callee_type metadata is dropped correctly when
+;; only the right instruction has callee_type metadata.
+define ptr @_Z10test_rightb(i1 zeroext %b) {
+; CHECK-LABEL: define ptr @_Z10test_rightb(
+; CHECK-SAME: i1 zeroext [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[FN:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr @_Znwm, ptr [[FN]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr [[FN]](i64 4)
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  %fn = alloca ptr
+  store ptr @_Znwm, ptr %fn
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call ptr %fn(i64 4)
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call ptr %fn(i64 4), !callee_type !3
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+;; Test if the callee_type metadata is merged correctly when
+;; each of the callee_type metadata are lists.
+define ptr @_Z10test_listb(i1 zeroext %b) {
+; CHECK-LABEL: define ptr @_Z10test_listb(
+; CHECK-SAME: i1 zeroext [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[FN:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr @_Znwm, ptr [[FN]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META4:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  %fn = alloca ptr
+  store ptr @_Znwm, ptr %fn
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call ptr %fn(i64 4), !callee_type !6
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call ptr %fn(i64 4), !callee_type !5
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+declare ptr @_Znwm(i64)
+
+!0 = !{i64 0, !"callee_type0.generalized"}
+!1 = !{i64 0, !"callee_type1.generalized"}
+!2 = !{i64 0, !"callee_type2.generalized"}
+!3 = !{!0}
+!4 = !{!2}
+!5 = !{!1, !2}
+!6 = !{!0, !2}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{i64 0, !"callee_type2.generalized"}
+; CHECK: [[META2]] = !{i64 0, !"callee_type0.generalized"}
+; CHECK: [[META3]] = !{[[META2]]}
+; CHECK: [[META4]] = !{[[META2]], [[META1]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{i64 0, !"callee_type1.generalized"}
+;.
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll
index 32581bb..d2d917d 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll
@@ -199,3 +199,44 @@ exit:
   %ret = phi i64 [ 0, %default ], [ 0, %bb1 ], [ 1, %entry ], [ 1, %bb2 ]
   ret i64 %ret
 }
+
+define i32 @switch_dup_unbounded_predecessors(i32 %val) {
+; SIMPLIFY-CFG-LABEL: define i32 @switch_dup_unbounded_predecessors(
+; SIMPLIFY-CFG-SAME: i32 [[VAL:%.*]]) {
+; SIMPLIFY-CFG-NEXT:  [[ENTRY:.*]]:
+; SIMPLIFY-CFG-NEXT:    switch i32 [[VAL]], label %[[EXIT:.*]] [
+; SIMPLIFY-CFG-NEXT:      i32 99, label %[[BB1:.*]]
+; SIMPLIFY-CFG-NEXT:      i32 115, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:      i32 102, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:      i32 70, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:      i32 101, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:      i32 69, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:      i32 103, label %[[BB1]]
+; SIMPLIFY-CFG-NEXT:    ]
+; SIMPLIFY-CFG:       [[BB1]]:
+; SIMPLIFY-CFG-NEXT:    br label %[[EXIT]]
+; SIMPLIFY-CFG:       [[EXIT]]:
+; SIMPLIFY-CFG-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[BB1]] ]
+; SIMPLIFY-CFG-NEXT:    ret i32 [[PHI]]
+;
+entry:
+  switch i32 %val, label %exit [
+  i32 99, label %bb1
+  i32 115, label %bb1
+  i32 102, label %bb2
+  i32 70, label %bb2
+  i32 101, label %bb2
+  i32 69, label %bb2
+  i32 103, label %bb2
+  ]
+
+bb1:
+  br label %exit
+
+bb2:
+  br label %exit
+
+exit:
+  %phi = phi i32 [ 0, %entry ], [ 1, %bb1 ], [ 1, %bb2 ]
+  ret i32 %phi
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
index 4136f33..8f2ae2d 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
@@ -149,7 +149,7 @@ unreach2:
 
 define void @pr53208_single_reachable_dest(i8 %sw, ptr %p0) {
 ; CHECK-LABEL: @pr53208_single_reachable_dest(
-; CHECK-NEXT:  group2:
+; CHECK-NEXT:  exit:
 ; CHECK-NEXT:    call void @bar(ptr [[P0:%.*]])
 ; CHECK-NEXT:    ret void
 ;