54 files changed, 4636 insertions, 233 deletions
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
index cc66cbe..ce1b591 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -102,9 +102,9 @@ if.end8:
 define i32 @test4(i32 %c) nounwind {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:    switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 1, label [[SW_BB:%.*]]
-; CHECK-NEXT:    i32 2, label [[SW_BB]]
-; CHECK-NEXT:    i32 4, label [[SW_BB]]
+; CHECK-NEXT:      i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT:      i32 2, label [[SW_BB]]
+; CHECK-NEXT:      i32 4, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
 ; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -207,8 +207,8 @@ define i1 @test7(i32 %c) nounwind {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 6, label [[SW_BB:%.*]]
-; CHECK-NEXT:    i32 7, label [[SW_BB]]
+; CHECK-NEXT:      i32 6, label [[SW_BB:%.*]]
+; CHECK-NEXT:      i32 7, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
 ; CHECK-NEXT:    ret i1 true
@@ -790,8 +790,8 @@ define i32 @test18(i8 %a) {
 ; CHECK-NEXT:    br label [[DISPATCH:%.*]]
 ; CHECK:       dispatch:
 ; CHECK-NEXT:    switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT:    i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT:    i8 -111, label [[DISPATCH]]
+; CHECK-NEXT:      i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT:      i8 -111, label [[DISPATCH]]
 ; CHECK-NEXT:    ]
 ; CHECK:       target93:
 ; CHECK-NEXT:    ret i32 93
@@ -817,8 +817,8 @@ define i8 @test19(i8 %a) {
 ; CHECK-NEXT:    br label [[DISPATCH:%.*]]
 ; CHECK:       dispatch:
 ; CHECK-NEXT:    switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT:    i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT:    i8 -111, label [[DISPATCH]]
+; CHECK-NEXT:      i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT:      i8 -111, label [[DISPATCH]]
 ; CHECK-NEXT:    ]
 ; CHECK:       target93:
 ; CHECK-NEXT:    ret i8 96
@@ -846,8 +846,8 @@ define i1 @test20(i64 %a) {
 ; CHECK-NEXT:    br label [[DISPATCH:%.*]]
 ; CHECK:       dispatch:
 ; CHECK-NEXT:    switch i64 [[A]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT:    i64 0, label [[EXIT2:%.*]]
-; CHECK-NEXT:    i64 -2147483647, label [[EXIT2]]
+; CHECK-NEXT:      i64 0, label [[EXIT2:%.*]]
+; CHECK-NEXT:      i64 -2147483647, label [[EXIT2]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default:
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i64 [[B]], 0
@@ -1123,6 +1123,70 @@ else:
   ret i1 true
 }
 
+define i1 @icmp_eq_range_attr(i8 range(i8 1, 0) %i) {
+; CHECK-LABEL: @icmp_eq_range_attr(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_attr(i8 range(i8 -1, 1) %i) {
+; CHECK-LABEL: @neg_icmp_eq_range_attr(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[I:%.*]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
+declare range(i8 1, 0) i8 @returns_non_zero_range_helper()
+declare range(i8 -1, 1) i8 @returns_contain_zero_range_helper()
+
+define i1 @icmp_eq_range_return() {
+; CHECK-LABEL: @icmp_eq_range_return(
+; CHECK-NEXT:    [[I:%.*]] = call i8 @returns_non_zero_range_helper()
+; CHECK-NEXT:    ret i1 false
+;
+  %i = call i8 @returns_non_zero_range_helper()
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_return() {
+; CHECK-LABEL: @neg_icmp_eq_range_return(
+; CHECK-NEXT:    [[I:%.*]] = call i8 @returns_contain_zero_range_helper()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %i = call i8 @returns_contain_zero_range_helper()
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
+declare i8 @returns_i8_helper()
+
+define i1 @icmp_eq_range_call() {
+; CHECK-LABEL: @icmp_eq_range_call(
+; CHECK-NEXT:    [[I:%.*]] = call range(i8 1, 0) i8 @returns_i8_helper()
+; CHECK-NEXT:    ret i1 false
+;
+  %i = call range(i8 1, 0) i8 @returns_i8_helper()
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_call() {
+; CHECK-LABEL: @neg_icmp_eq_range_call(
+; CHECK-NEXT:    [[I:%.*]] = call range(i8 0, 11) i8 @returns_i8_helper()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %i = call range(i8 0, 11) i8 @returns_i8_helper()
+  %cmp = icmp eq i8 %i, 0
+  ret i1 %cmp
+}
+
 declare i16 @llvm.ctlz.i16(i16, i1)
 declare i16 @llvm.cttz.i16(i16, i1)
 declare i16 @llvm.ctpop.i16(i16)
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
index 75130c2..e058c5b 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
@@ -176,3 +176,80 @@ define i129 @fp128tosi129(fp128 %a) {
   %conv = fptosi fp128 %a to i129
   ret i129 %conv
 }
+
+define <2 x i129> @floattosi129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattosi129v2(
+; CHECK-NEXT:  fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT:    [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT:    [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT:    [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT:    br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK:       fp-to-i-if-end2:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT:    br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK:       fp-to-i-if-then53:
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-if-end94:
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT:    br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK:       fp-to-i-if-then125:
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-if-else6:
+; CHECK-NEXT:    [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-cleanup1:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT:    [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT:    [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT:    [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT:    [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT:    br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK:       fp-to-i-if-end:
+; CHECK-NEXT:    [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT:    br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK:       fp-to-i-if-then5:
+; CHECK-NEXT:    [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-if-end9:
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT:    br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK:       fp-to-i-if-then12:
+; CHECK-NEXT:    [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-if-else:
+; CHECK-NEXT:    [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT:    [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-cleanup:
+; CHECK-NEXT:    [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP43]]
+;
+  %conv = fptosi <2 x float> %a to <2 x i129>
+  ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
index ed630d7..c699f80 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
@@ -176,3 +176,80 @@ define i129 @fp128toui129(fp128 %a) {
   %conv = fptoui fp128 %a to i129
   ret i129 %conv
 }
+
+define <2 x i129> @floattoui129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattoui129v2(
+; CHECK-NEXT:  fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT:    [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT:    [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT:    [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT:    br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK:       fp-to-i-if-end2:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT:    br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK:       fp-to-i-if-then53:
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-if-end94:
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT:    br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK:       fp-to-i-if-then125:
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-if-else6:
+; CHECK-NEXT:    [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP1]]
+; CHECK:       fp-to-i-cleanup1:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT:    [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT:    [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT:    [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT:    [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT:    br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK:       fp-to-i-if-end:
+; CHECK-NEXT:    [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT:    br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK:       fp-to-i-if-then5:
+; CHECK-NEXT:    [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-if-end9:
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT:    br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK:       fp-to-i-if-then12:
+; CHECK-NEXT:    [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT:    [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-if-else:
+; CHECK-NEXT:    [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT:    [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT:    br label [[FP_TO_I_CLEANUP]]
+; CHECK:       fp-to-i-cleanup:
+; CHECK-NEXT:    [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT:    ret <2 x i129> [[TMP43]]
+;
+  %conv = fptoui <2 x float> %a to <2 x i129>
+  ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index 76f5248..f70ce2f 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @si129tofp128(i129 %a) {
   %conv = sitofp i129 %a to fp128
   ret fp128 %conv
 }
+
+define <2 x float> @si129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @si129tofloatv2(
+; CHECK-NEXT:  itofp-entryitofp-entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK:       itofp-if-end2:
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK:       itofp-if-then43:
+; CHECK-NEXT:    switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       itofp-sw-bb4:
+; CHECK-NEXT:    [[TMP10:%.*]] = shl i129 [[TMP4]], 1
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
+; CHECK:       itofp-sw-default5:
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP4]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT:    [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
+; CHECK:       itofp-sw-epilog6:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP4]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT:    [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT:    [[TMP28:%.*]] = ashr i129 [[TMP27]], 2
+; CHECK-NEXT:    [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT:    [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT:    br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK:       itofp-if-then207:
+; CHECK-NEXT:    [[TMP33:%.*]] = ashr i129 [[TMP27]], 3
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
+; CHECK:       itofp-if-else8:
+; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT:    [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT:    [[TMP39:%.*]] = shl i129 [[TMP4]], [[TMP38]]
+; CHECK-NEXT:    [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT:    [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT:    [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
+; CHECK:       itofp-if-end269:
+; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
+; CHECK-NEXT:    [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT:    br label [[ITOFP_RETURN1]]
+; CHECK:       itofp-return1:
+; CHECK-NEXT:    [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK:       itofp-if-end:
+; CHECK-NEXT:    [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT:    [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP59]], i1 true)
+; CHECK-NEXT:    [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT:    [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT:    br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK:       itofp-if-then4:
+; CHECK-NEXT:    switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       itofp-sw-bb:
+; CHECK-NEXT:    [[TMP65:%.*]] = shl i129 [[TMP59]], 1
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
+; CHECK:       itofp-sw-default:
+; CHECK-NEXT:    [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT:    [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT:    [[TMP68:%.*]] = lshr i129 [[TMP59]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT:    [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT:    [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP59]]
+; CHECK-NEXT:    [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT:    [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT:    [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
+; CHECK:       itofp-sw-epilog:
+; CHECK-NEXT:    [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP59]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT:    [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT:    [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT:    [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT:    [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT:    [[TMP83:%.*]] = ashr i129 [[TMP82]], 2
+; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT:    [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT:    [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT:    [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT:    br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK:       itofp-if-then20:
+; CHECK-NEXT:    [[TMP88:%.*]] = ashr i129 [[TMP82]], 3
+; CHECK-NEXT:    [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT:    [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT:    [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
+; CHECK:       itofp-if-else:
+; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT:    [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT:    [[TMP94:%.*]] = shl i129 [[TMP59]], [[TMP93]]
+; CHECK-NEXT:    [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT:    [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT:    [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
+; CHECK:       itofp-if-end26:
+; CHECK-NEXT:    [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT:    [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT:    [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT:    [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT:    [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT:    [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT:    [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT:    [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT:    [[TMP106:%.*]] = or i32 [[TMP105]], [[TMP103]]
+; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT:    br label [[ITOFP_RETURN]]
+; CHECK:       itofp-return:
+; CHECK-NEXT:    [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT:    ret <2 x float> [[TMP109]]
+;
+  %conv = sitofp <2 x i129> %a to <2 x float>
+  ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index 96d87a5..ee54d53 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @ui129tofp128(i129 %a) {
   %conv = uitofp i129 %a to fp128
   ret fp128 %conv
 }
+
+define <2 x float> @ui129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @ui129tofloatv2(
+; CHECK-NEXT:  itofp-entryitofp-entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK:       itofp-if-end2:
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP0]], i1 true)
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK:       itofp-if-then43:
+; CHECK-NEXT:    switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       itofp-sw-bb4:
+; CHECK-NEXT:    [[TMP10:%.*]] = shl i129 [[TMP0]], 1
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
+; CHECK:       itofp-sw-default5:
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT:    [[TMP13:%.*]] = lshr i129 [[TMP0]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT:    [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG6]]
+; CHECK:       itofp-sw-epilog6:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP0]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT:    [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT:    [[TMP28:%.*]] = lshr i129 [[TMP27]], 2
+; CHECK-NEXT:    [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT:    [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT:    br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK:       itofp-if-then207:
+; CHECK-NEXT:    [[TMP33:%.*]] = lshr i129 [[TMP27]], 3
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT:    [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
+; CHECK:       itofp-if-else8:
+; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT:    [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT:    [[TMP39:%.*]] = shl i129 [[TMP0]], [[TMP38]]
+; CHECK-NEXT:    [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT:    [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT:    [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END269]]
+; CHECK:       itofp-if-end269:
+; CHECK-NEXT:    [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT:    [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT:    [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT:    [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT:    [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT:    [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
+; CHECK-NEXT:    [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT:    br label [[ITOFP_RETURN1]]
+; CHECK:       itofp-return1:
+; CHECK-NEXT:    [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK:       itofp-if-end:
+; CHECK-NEXT:    [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT:    [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP55]], i1 true)
+; CHECK-NEXT:    [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT:    [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT:    br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK:       itofp-if-then4:
+; CHECK-NEXT:    switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       itofp-sw-bb:
+; CHECK-NEXT:    [[TMP65:%.*]] = shl i129 [[TMP55]], 1
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
+; CHECK:       itofp-sw-default:
+; CHECK-NEXT:    [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT:    [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT:    [[TMP68:%.*]] = lshr i129 [[TMP55]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT:    [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT:    [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP55]]
+; CHECK-NEXT:    [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT:    [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT:    [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT:    br label [[ITOFP_SW_EPILOG]]
+; CHECK:       itofp-sw-epilog:
+; CHECK-NEXT:    [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP55]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT:    [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT:    [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT:    [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT:    [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT:    [[TMP83:%.*]] = lshr i129 [[TMP82]], 2
+; CHECK-NEXT:    [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT:    [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT:    [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT:    [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT:    br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK:       itofp-if-then20:
+; CHECK-NEXT:    [[TMP88:%.*]] = lshr i129 [[TMP82]], 3
+; CHECK-NEXT:    [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT:    [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT:    [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
+; CHECK:       itofp-if-else:
+; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT:    [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT:    [[TMP94:%.*]] = shl i129 [[TMP55]], [[TMP93]]
+; CHECK-NEXT:    [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT:    [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT:    [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT:    br label [[ITOFP_IF_END26]]
+; CHECK:       itofp-if-end26:
+; CHECK-NEXT:    [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT:    [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT:    [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT:    [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT:    [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT:    [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT:    [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT:    [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT:    [[TMP106:%.*]] = or i32 [[TMP104]], [[TMP103]]
+; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT:    br label [[ITOFP_RETURN]]
+; CHECK:       itofp-return:
+; CHECK-NEXT:    [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT:    ret <2 x float> [[TMP109]]
+;
+  %conv = uitofp <2 x i129> %a to <2 x float>
+  ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll
new file mode 100644
index 0000000..c02d84d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/implies.ll
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle(
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X:%.*]], 23
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %or = or i8 %x, 23
+  %cond = icmp sle i8 %or, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @or_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle_fail(
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X:%.*]], -34
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %or = or i8 %x, -34
+  %cond = icmp sle i8 %or, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @or_distjoint_implies_ule(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule(
+; CHECK-NEXT:    [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = or disjoint i8 %x, 23
+  %x2 = or disjoint i8 %x, 24
+
+  %cond = icmp ule i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp ule i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @or_distjoint_implies_ule_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule_fail(
+; CHECK-NEXT:    [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[X1:%.*]] = or disjoint i8 [[X]], 28
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X1]], [[Y]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = or disjoint i8 %x, 28
+  %x2 = or disjoint i8 %x, 24
+
+  %cond = icmp ule i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp ule i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @or_prove_distjoin_implies_ule(i8 %xx, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_prove_distjoin_implies_ule(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], -16
+; CHECK-NEXT:    [[X2:%.*]] = or disjoint i8 [[X]], 10
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x = and i8 %xx, -16
+  %x1 = or i8 %x, 7
+  %x2 = or i8 %x, 10
+
+  %cond = icmp ule i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp ule i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle(
+; CHECK-NEXT:    [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = or disjoint i8 %x, 23
+  %x2 = or disjoint i8 %x, 24
+
+  %cond = icmp sle i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle_fail(
+; CHECK-NEXT:    [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[X1:%.*]] = or disjoint i8 [[X]], 23
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = or disjoint i8 %x, 23
+  %x2 = or disjoint i8 %x, 24
+
+  %cond = icmp sle i8 %y, %x2
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle(
+; CHECK-NEXT:    [[X2:%.*]] = add nsw i8 [[X:%.*]], 24
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = add nsw i8 %x, 23
+  %x2 = add nsw i8 %x, 24
+
+  %cond = icmp sle i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle_fail(
+; CHECK-NEXT:    [[X2:%.*]] = add nsw i8 [[X:%.*]], 23
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[X1:%.*]] = add nsw i8 [[X]], 24
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %x1 = add nsw i8 %x, 24
+  %x2 = add nsw i8 %x, 23
+
+  %cond = icmp sle i8 %x2, %y
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x1, %y
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult(
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp ult i8 %x, %z
+  br i1 %cond, label %T, label %F
+T:
+  %and = and i8 %z, %x
+  %r = icmp ult i8 %and, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_and_implies_ult_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult_fail(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[Z]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[Z]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp ule i8 %x, %z
+  br i1 %cond, label %T, label %F
+T:
+  %and = and i8 %x, %z
+  %r = icmp ult i8 %and, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_and_implies_slt_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_slt_fail(
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[AND]], [[Z]]
+; CHECK-NEXT:    ret i1 [[R]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp slt i8 %x, %z
+  br i1 %cond, label %T, label %F
+T:
+  %and = and i8 %x, %y
+  %r = icmp slt i8 %and, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_ule(
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %or = or i8 %y, %x
+  %cond = icmp uge i8 %z, %or
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp ule i8 %x, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_or_implies_false_ugt_todo(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_false_ugt_todo(
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+; CHECK:       F:
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[Z]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %or = or i8 %x, %y
+  %cond = icmp ugt i8 %or, %z
+  br i1 %cond, label %T, label %F
+T:
+  ret i1 %other
+F:
+  %r = icmp ugt i8 %x, %z
+  ret i1 %r
+
+}
+
+define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult(
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp ugt i8 %z, %x
+  br i1 %cond, label %T, label %F
+T:
+  %and = udiv i8 %x, 3
+  %r = icmp ult i8 %and, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult2(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 true
+;
+  %cond = icmp ule i8 %z, %x
+  br i1 %cond, label %T, label %F
+T:
+  ret i1 %other
+F:
+  %and = udiv i8 %x, 3
+  %r = icmp ult i8 %and, %z
+  ret i1 %r
+}
+
+define i1 @src_smin_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smin_implies_sle(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp sle i8 %x, %z
+  br i1 %cond, label %T, label %F
+T:
+  %um = call i8 @llvm.smin.i8(i8 %x, i8 %y)
+  %r = icmp sle i8 %um, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_umin_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umin_implies_ule(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %cond = icmp ule i8 %x, %z
+  br i1 %cond, label %T, label %F
+T:
+  %um = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+  %r = icmp ule i8 %um, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umax_implies_ule(
+; CHECK-NEXT:    [[UM:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %um = call i8 @llvm.umax.i8(i8 %x, i8 %y)
+  %cond = icmp ule i8 %um, %z
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp ule i8 %x, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
+
+define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smax_implies_sle(
+; CHECK-NEXT:    [[UM:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp sgt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK:       T:
+; CHECK-NEXT:    ret i1 true
+; CHECK:       F:
+; CHECK-NEXT:    ret i1 [[OTHER:%.*]]
+;
+  %um = call i8 @llvm.smax.i8(i8 %x, i8 %y)
+  %cond = icmp sle i8 %um, %z
+  br i1 %cond, label %T, label %F
+T:
+  %r = icmp sle i8 %x, %z
+  ret i1 %r
+F:
+  ret i1 %other
+}
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index 5305c78..769f766 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -124,7 +124,6 @@ exit:
   ret i8 %or2
 }
 
-
 define i8 @test_cond_and_bothways(i8 %x) {
 ; CHECK-LABEL: @test_cond_and_bothways(
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X:%.*]], 91
@@ -181,8 +180,6 @@ exit:
   ret i8 %or2
 }
 
-
-
 define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) {
 ; CHECK-LABEL: @test_cond_and_commuted(
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X:%.*]], 3
@@ -343,7 +340,7 @@ exit:
   ret i8 %or2
 }
 
-define i32 @test_icmp_trunc1(i32 %x){
+define i32 @test_icmp_trunc1(i32 %x) {
 ; CHECK-LABEL: @test_icmp_trunc1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -365,7 +362,7 @@ else:
   ret i32 0
 }
 
-define i32 @test_icmp_trunc_assume(i32 %x){
+define i32 @test_icmp_trunc_assume(i32 %x) {
 ; CHECK-LABEL: @test_icmp_trunc_assume(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -532,7 +529,106 @@ if.else:
   ret i1 %other
 }
 
+define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set(
+; CHECK-NEXT:    [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i8 1
+;
+  %xy = and i8 %x, %y
+  %cmp = icmp eq i8 %xy, 123
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %x, 1
+  ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2(
+; CHECK-NEXT:    [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i8 11
+;
+  %xy = and i8 %x, %y
+  %cmp = icmp eq i8 %xy, 123
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %y, 11
+  ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2_partial_fail(
+; CHECK-NEXT:    [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[Y]], 111
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xy = and i8 %x, %y
+  %cmp = icmp eq i8 %xy, 123
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %y, 111
+  ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset(
+; CHECK-NEXT:    [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i8 0
+;
+  %xy = or i8 %x, %y
+  %cmp = icmp eq i8 %xy, 124
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %x, 3
+  ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2(
+; CHECK-NEXT:    [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i8 0
+;
+  %xy = or i8 %x, %y
+  %cmp = icmp eq i8 %xy, 124
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %y, 1
+  ret i8 %r
+}
 
+define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2_partial_fail(
+; CHECK-NEXT:    [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[Y]], 4
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xy = or i8 %x, %y
+  %cmp = icmp eq i8 %xy, 124
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %y, 7
+  ret i8 %r
+}
+
+define i8 @or_ne_bits_must_be_unset2_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_ne_bits_must_be_unset2_fail(
+; CHECK-NEXT:    [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[XY]], 124
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[X]], 3
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %xy = or i8 %x, %y
+  %cmp = icmp ne i8 %xy, 124
+  call void @llvm.assume(i1 %cmp)
+  %r = and i8 %x, 3
+  ret i8 %r
+}
 
 declare void @use(i1)
 declare void @sink(i8)
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 278cabd..05fcf66 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3693,6 +3693,800 @@ exit:
   ret i32 %rem
 }
 
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/QXQDwF
+; X&Y==C?X|Y:X^Y, X&Y==C?X^Y:X|Y
+; TODO: X&Y==0 could imply no_common_bit to TrueValue
+define i32 @src_and_eq_0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_or_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, 0
+  %or = or i32 %y, %x
+  %xor = xor i32 %y, %x
+  %cond = select i1 %cmp, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+; TODO: X&Y==0 could imply no_common_bit to TrueValue
+define i32 @src_and_eq_0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_xor_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, 0
+  %xor = xor i32 %y, %x
+  %or = or i32 %y, %x
+  %cond = select i1 %cmp, i32 %xor, i32 %or
+  ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit to TrueValue
+define i32 @src_and_eq_neg1_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_or_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, -1
+  %or = or i32 %y, %x
+  %xor = xor i32 %y, %x
+  %cond = select i1 %cmp, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit to TrueValue
+define i32 @src_and_eq_neg1_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_xor_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, -1
+  %xor = xor i32 %y, %x
+  %or = or i32 %y, %x
+  %cond = select i1 %cmp, i32 %xor, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xororC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[OR1]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, %c
+  %or = or i32 %y, %x
+  %xor = xor i32 %y, %x
+  %or1 = or i32 %xor, %c
+  %cond = select i1 %cmp, i32 %or, i32 %or1
+  ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xorxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xorxorC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, %c
+  %or = or i32 %y, %x
+  %xor = xor i32 %y, %x
+  %xor1 = xor i32 %xor, %c
+  %cond = select i1 %cmp, i32 %or, i32 %xor1
+  ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_OrAndNotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_OrAndNotC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT:    ret i32 [[AND1]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, %c
+  %xor = xor i32 %y, %x
+  %or = or i32 %y, %x
+  %not = xor i32 %c, -1
+  %and1 = and i32 %or, %not
+  %cond = select i1 %cmp, i32 %xor, i32 %and1
+  ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_orxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_orxorC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %and = and i32 %y, %x
+  %cmp = icmp eq i32 %and, %c
+  %xor = xor i32 %y, %x
+  %or = or i32 %y, %x
+  %xor1 = xor i32 %or, %c
+  %cond = select i1 %cmp, i32 %xor, i32 %xor1
+  ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/9RPwfN
+; X|Y==C?X&Y:X^Y, X|Y==C?X^Y:X&Y
+; TODO: X|Y==0 could imply no_common_bit to TrueValue
+define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_and_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, 0
+  %and = and i32 %y, %x
+  %xor = xor i32 %y, %x
+  %cond = select i1 %cmp, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+; TODO: X|Y==0 could imply no_common_bit to TrueValue
+define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_xor_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, 0
+  %xor = xor i32 %y, %x
+  %and = and i32 %y, %x
+  %cond = select i1 %cmp, i32 %xor, i32 %and
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_and_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT:    ret i32 [[NOT]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, -1
+  %and = and i32 %y, %x
+  %0 = xor i32 %x, %y
+  %not = xor i32 %0, -1
+  %cond = select i1 %cmp, i32 %and, i32 %not
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_xor_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT:    ret i32 [[NOT]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, -1
+  %xor = xor i32 %y, %x
+  %and = and i32 %y, %x
+  %not = xor i32 %and, -1
+  %cond = select i1 %cmp, i32 %xor, i32 %not
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_xorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_xorC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, %c
+  %and = and i32 %y, %x
+  %xor = xor i32 %y, %x
+  %xor1 = xor i32 %xor, %c
+  %cond = select i1 %cmp, i32 %and, i32 %xor1
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_andnotxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_andnotxorC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[AND1]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, %c
+  %and = and i32 %y, %x
+  %0 = xor i32 %x, %y
+  %not = xor i32 %0, -1
+  %and1 = and i32 %not, %c
+  %cond = select i1 %cmp, i32 %and, i32 %and1
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_xorandC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, %c
+  %xor = xor i32 %y, %x
+  %and = and i32 %y, %x
+  %xor1 = xor i32 %and, %c
+  %cond = select i1 %cmp, i32 %xor, i32 %xor1
+  ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_andnotandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_andnotandC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[AND1]]
+;
+entry:
+  %or = or i32 %y, %x
+  %cmp = icmp eq i32 %or, %c
+  %xor = xor i32 %y, %x
+  %and = and i32 %y, %x
+  %not = xor i32 %and, -1
+  %and1 = and i32 %not, %c
+  %cond = select i1 %cmp, i32 %xor, i32 %and1
+  ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/c6oXi4
+; X^Y==C?X&Y:X|Y, X^Y==C?X|Y:X&Y
+define i32 @src_xor_eq_neg1_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[OR]], -1
+; CHECK-NEXT:    ret i32 [[NOT]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, -1
+  %and = and i32 %y, %x
+  %or = or i32 %y, %x
+  %not = xor i32 %or, -1
+  %cond = select i1 %cmp, i32 %and, i32 %not
+  ret i32 %cond
+}
+
+; TODO: X^Y==-1 could imply no_common_bit to TrueValue
+define i32 @src_xor_eq_neg1_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 -1
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, -1
+  %or = or i32 %y, %x
+  %cond = select i1 %cmp, i32 %or, i32 -1
+  ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_xororC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, %c
+  %and = and i32 %y, %x
+  %or = or i32 %y, %x
+  %xor1 = xor i32 %or, %c
+  %cond = select i1 %cmp, i32 %and, i32 %xor1
+  ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_andornotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_andornotC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT:    ret i32 [[AND1]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, %c
+  %and = and i32 %y, %x
+  %or = or i32 %y, %x
+  %not = xor i32 %c, -1
+  %and1 = and i32 %or, %not
+  %cond = select i1 %cmp, i32 %and, i32 %and1
+  ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_xorandC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR1]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, %c
+  %or = or i32 %y, %x
+  %and = and i32 %y, %x
+  %xor1 = xor i32 %and, %c
+  %cond = select i1 %cmp, i32 %or, i32 %xor1
+  ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_orandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_orandC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[OR1]]
+;
+entry:
+  %xor = xor i32 %y, %x
+  %cmp = icmp eq i32 %xor, %c
+  %or = or i32 %y, %x
+  %and = and i32 %y, %x
+  %or1 = or i32 %and, %c
+  %cond = select i1 %cmp, i32 %or, i32 %or1
+  ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; NO TRANSFORMED - select condition is compare with not 0
+define i32 @src_select_and_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_positive_int(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 1
+  %xor = xor i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %and0, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_and_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_max_positive_int(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 2147483647
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 2147483647
+  %xor = xor i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %and0, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_and_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_negative_int(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], -2147483648
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, -2147483648
+  %xor = xor i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %and0, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_or_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_positive_int(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 1
+  %and = and i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_or_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_positive_int(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 2147483647
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 2147483647
+  %and = and i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_or_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_negative_int(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], -2147483648
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, -2147483648
+  %and = and i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_or_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_negative_int(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], -1
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, -1
+  %and = and i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_select_xor_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_positive_int(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 1
+  %and = and i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_select_xor_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_positive_int(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], 2147483647
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 2147483647
+  %and = and i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_select_xor_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_negative_int(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], -2147483648
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, -2147483648
+  %and = and i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_select_xor_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_negative_int(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, -1
+  %and = and i32 %x, %y
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/BVgrJ-
+; NO TRANSFORMED - not supported
+define i32 @src_no_trans_select_and_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_or(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 0
+  %or = or i32 %x, %y
+  %cond = select i1 %and0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_xor(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 0
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_or_and(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[AND]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 0
+  %or = or i32 %x, %y
+  %cond = select i1 %and0, i32 %or, i32 %and
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_xor_and(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[AND0]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, 0
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %xor, i32 %and
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 0
+  %and = and i32 %x, %y
+  %cond = select i1 %or0, i32 %or, i32 %and
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 0
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_and_or(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 0
+  %and = and i32 %x, %y
+  %cond = select i1 %or0, i32 %and, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_xor_or(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp eq i32 %or, 0
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %xor, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %or = or i32 %x, %y
+  %or0 = icmp ne i32 %or, 0
+  %xor = xor i32 %x, %y
+  %cond = select i1 %or0, i32 %xor, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_and(
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 0
+  %and = and i32 %x, %y
+  %cond = select i1 %xor0, i32 %xor, i32 %and
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_or(
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[OR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 0
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %xor, i32 %or
+  ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_and_xor(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 0
+  %and = and i32 %x, %y
+  %cond = select i1 %xor0, i32 %and, i32 %xor
+  ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/SBe8ei
+define i32 @src_no_trans_select_xor_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_or_xor(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[XOR0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %xor = xor i32 %x, %y
+  %xor0 = icmp eq i32 %xor, 0
+  %or = or i32 %x, %y
+  %cond = select i1 %xor0, i32 %or, i32 %xor
+  ret i32 %cond
+}
+
 ; (X == C) ? X : Y -> (X == C) ? C : Y
 ; Fixed #77553
 define i32 @src_select_xxory_eq0_xorxy_y(i32 %x, i32 %y) {
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 661c360..a4b74aa 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes='instcombine<no-verify-fixpoint>' -S | FileCheck %s
 
 define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) {
 ; CHECK-LABEL: @zext_or_icmp_icmp(
@@ -180,11 +180,11 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i64 [[SUB17]], 32
 ; CHECK-NEXT:    [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]]
-; CHECK-NEXT:    [[CONV19:%.*]] = zext i1 [[CMP]] to i16
-; CHECK-NEXT:    [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
-; CHECK-NEXT:    [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
+; CHECK-NEXT:    [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT:    [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]]
+; CHECK-NEXT:    [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[TOBOOL23_NOT]])
-; CHECK-NEXT:    ret i8 [[I162]]
+; CHECK-NEXT:    ret i8 [[INC]]
 ;
   %b = icmp eq i32 %t0, 0
   %b2 = icmp eq i16 %insert, 0
diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index b70dc90..7e3cb65 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -155,7 +155,13 @@ define i1 @test9(i32 %length.i, i32 %i) {
 
 define i1 @test10(i32 %length.i, i32 %x.full) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[X_FULL:%.*]], -65536
+; CHECK-NEXT:    [[LARGE:%.*]] = or i32 [[X]], 100
+; CHECK-NEXT:    [[SMALL:%.*]] = or i32 [[X]], 90
+; CHECK-NEXT:    [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT:    [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %x = and i32 %x.full, 4294901760  ;; 4294901760 == 0xffff0000
   %large = or i32 %x, 100
@@ -166,6 +172,19 @@ define i1 @test10(i32 %length.i, i32 %x.full) {
   ret i1 %res
 }
 
+define i1 @test10_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test10_with_disjoint(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = and i32 %x.full, 4294901760  ;; 4294901760 == 0xffff0000
+  %large = or disjoint i32 %x, 100
+  %small = or disjoint i32 %x, 90
+  %known = icmp ult i32 %large, %length.i
+  %to.prove = icmp ult i32 %small, %length.i
+  %res = icmp ule i1 %known, %to.prove
+  ret i1 %res
+}
+
 define i1 @test11(i32 %length.i, i32 %x) {
 ; CHECK-LABEL: @test11(
 ; CHECK-NEXT:    [[LARGE:%.*]] = or i32 [[X:%.*]], 100
@@ -216,7 +235,13 @@ define i1 @test13(i32 %length.i, i32 %x) {
 
 define i1 @test14(i32 %length.i, i32 %x.full) {
 ; CHECK-LABEL: @test14(
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[X_FULL:%.*]], -61681
+; CHECK-NEXT:    [[LARGE:%.*]] = or i32 [[X]], 8224
+; CHECK-NEXT:    [[SMALL:%.*]] = or i32 [[X]], 4112
+; CHECK-NEXT:    [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT:    [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %x = and i32 %x.full, 4294905615  ;; 4294905615 == 0xffff0f0f
   %large = or i32 %x, 8224 ;; == 0x2020
@@ -227,6 +252,19 @@ define i1 @test14(i32 %length.i, i32 %x.full) {
   ret i1 %res
 }
 
+define i1 @test14_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test14_with_disjoint(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = and i32 %x.full, 4294905615  ;; 4294905615 == 0xffff0f0f
+  %large = or disjoint i32 %x, 8224 ;; == 0x2020
+  %small = or disjoint i32 %x, 4112 ;; == 0x1010
+  %known = icmp ult i32 %large, %length.i
+  %to.prove = icmp ult i32 %small, %length.i
+  %res = icmp ule i1 %known, %to.prove
+  ret i1 %res
+}
+
 define i1 @test15(i32 %length.i, i32 %x) {
 ; CHECK-LABEL: @test15(
 ; CHECK-NEXT:    [[LARGE:%.*]] = add nuw i32 [[X:%.*]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 3e895edc..afd49aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -43,9 +43,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -135,9 +134,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..2ce2a45
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..5d1a471
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%16>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
+; CHECK-NEXT:     EMIT vp<%5> = icmp ule ir<%iv>, vp<%2>
+; CHECK-NEXT:   Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT:  <xVFxUF> pred.store: {
+; CHECK-NEXT:    pred.store.entry:
+; CHECK-NEXT:      BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT:    Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.store.if:
+; CHECK-NEXT:      vp<%6> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:      REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%6>
+; CHECK-NEXT:      REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT:      REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%6>
+; CHECK-NEXT:      REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT:      REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%6>
+; CHECK-NEXT:      REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT:      REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT:    Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.store.continue:
+; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<%14> = ir<%0>
+; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<%15> = ir<%1>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT:  for.body.2:
+; CHECK-NEXT:     EMIT vp<%16> = add vp<%3>, vp<%0>
+; CHECK-NEXT:     EMIT branch-on-count vp<%16>, vp<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%10>
+; CHECK-NEXT:     vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT:     CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT:     vp<%5> = vector-pointer ir<%a1>
+; CHECK-NEXT:     WIDEN ir<%v> = load vp<%5>
+; CHECK-NEXT:     CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT:     CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT:     vp<%9> = vector-pointer ir<%a2>
+; CHECK-NEXT:     WIDEN store vp<%9>, ir<%v>
+; CHECK-NEXT:     EMIT vp<%10> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT:     EMIT branch-on-count vp<%10>, vp<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 100
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 511
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 57e1dc9..b876e9d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,11 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
 ; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
 
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
 target triple = "riscv64"
 
+; FIXME: inloop reductions are not supported yet with predicated vectorization.
+
 define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; OUTLOOP-LABEL: @add_i16_i32(
 ; OUTLOOP-NEXT:  entry:
@@ -115,6 +117,70 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; INLOOP-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; INLOOP-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
+; IF-EVL-LABEL: @add_i16_i32(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL:       for.body.preheader:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; IF-EVL-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
+; IF-EVL-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-NEXT:    [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup.loopexit:
+; IF-EVL-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT:    br label [[FOR_COND_CLEANUP]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-NEXT:    ret i32 [[R_0_LCSSA]]
+;
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
new file mode 100644
index 0000000..835ff37
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
+; IF-EVL-LABEL: @gather_scatter(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; IF-EVL-NEXT:    [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
+; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
+; IF-EVL-NEXT:    store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.end:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @gather_scatter(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; NO-VP-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
+; NO-VP-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NO-VP:       for.end:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
+  %0 = load i64, ptr %arrayidx3, align 8
+  %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %0
+  %1 = load float, ptr %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %0
+  store float %1, ptr %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
new file mode 100644
index 0000000..0b495bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; FIXME: interleaved accesses are not supported yet with predicated vectorization.
+define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
+; IF-EVL-LABEL: @interleave(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP31]], 8
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; IF-EVL-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT:    [[TMP37:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; IF-EVL-NEXT:    [[TMP38:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP38]], 1
+; IF-EVL-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; IF-EVL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; IF-EVL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; IF-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; IF-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @interleave(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
+; NO-VP-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; NO-VP-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; NO-VP-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; NO-VP-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
+; NO-VP-NEXT:    store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
+; NO-VP-NEXT:    store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; NO-VP-NEXT:    [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; NO-VP-NEXT:    [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
new file mode 100644
index 0000000..d5ad99f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
+; IF-EVL-LABEL: @iv32(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i32 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i32 [[IV]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY1:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV1]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV1]]
+; IF-EVL-NEXT:    store i32 [[TMP0]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @iv32(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP0]], 4
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]]
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP11]]
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP2]], 4
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; NO-VP-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; NO-VP-NEXT:    store i32 [[TMP9]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 %iv
+  store i32 %0, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond.not = icmp eq i32 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
new file mode 100644
index 0000000..203d0c9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
+; IF-EVL-LABEL: @masked_loadstore(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL:       if.then:
+; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    br label [[FOR_INC]]
+; IF-EVL:       for.inc:
+; IF-EVL-NEXT:    [[INC]] = add nuw nsw i64 [[I_011]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @masked_loadstore(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I_011]]
+; NO-VP-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; NO-VP-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP:       if.then:
+; NO-VP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[I_011]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT:    br label [[FOR_INC]]
+; NO-VP:       for.inc:
+; NO-VP-NEXT:    [[INC]] = add nuw nsw i64 [[I_011]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp ne i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
+  %1 = load i32, ptr %arrayidx3, align 4
+  %add = add i32 %0, %1
+  store i32 %add, ptr %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i64 %i.011, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
new file mode 100644
index 0000000..1c49fba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; No need to emit predicated vector code if the vector instructions with masking are not required.
+define i32 @no_masking() {
+; CHECK-LABEL: @no_masking(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BODY]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[P]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %body
+
+body:
+  %p = phi i32 [ 1, %entry ], [ %inc, %body ]
+  %inc = add i32 %p, 1
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %end, label %body
+
+end:
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
new file mode 100644
index 0000000..f2222e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; FIXME: reversed loads/stores are not supported yet with predicated vectorization.
+define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
+; IF-EVL-LABEL: @reverse_load_store(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; IF-EVL-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 0, [[TMP14]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP21]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT:    store i32 [[TMP]], ptr [[GEPS]], align 4
+; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
+; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       loopend:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @reverse_load_store(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
+; NO-VP-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; NO-VP-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; NO-VP-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT:    store i32 [[TMP]], ptr [[GEPS]], align 4
+; NO-VP-NEXT:    [[INC]] = add i32 [[I]], 1
+; NO-VP-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; NO-VP-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP:       loopend:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ]
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %add = add i64 %add.phi, -1
+  %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add
+  %tmp = load i32, ptr %gepl, align 4
+  %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
+  store i32 %tmp, ptr %geps, align 4
+  %inc = add i32 %i, 1
+  %exitcond = icmp ne i32 %inc, 1024
+  br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
new file mode 100644
index 0000000..c69bb17
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..72b881b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -0,0 +1,134 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL:      vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT:  vector.body:
+; IF-EVL-NEXT:    EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:    EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT:    EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; IF-EVL-NEXT:    vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:    WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:    WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT:    WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:    WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT:    SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT:  No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP:      vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT:  vector.body:
+; NO-VP-NEXT:    EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT:    vp<[[ST:%[0-9]+]]>    = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT:    WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT:    WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT:    WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT:    WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT:    EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT:  No successors
+; NO-VP-NEXT: }
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK:      vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:  vector.body:
+; CHECK-NEXT:    EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT:    vp<[[ST:%[0-9]+]]>    = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]>
+; CHECK-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; CHECK-NEXT:    WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]>
+; CHECK-NEXT:    CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100>
+; CHECK-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]>
+; CHECK-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; CHECK-NEXT:    WIDEN store vp<[[PTR2]]>, ir<[[V]]>
+; CHECK-NEXT:    EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:  No successors
+; CHECK-NEXT: }
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 100
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 511
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..1cf71360
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; IF-EVL-NEXT:    [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT:    [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT:    call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT:  iter.check:
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; NO-VP:       vector.main.loop.iter.check:
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; NO-VP-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 32
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 48
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; NO-VP-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
+; NO-VP-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
+; NO-VP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 32
+; NO-VP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 48
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP1]]
+; NO-VP-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP2]]
+; NO-VP-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 16
+; NO-VP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 32
+; NO-VP-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 48
+; NO-VP-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP18]], align 4
+; NO-VP-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP19]], align 4
+; NO-VP-NEXT:    [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; NO-VP-NEXT:    [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
+; NO-VP-NEXT:    [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
+; NO-VP-NEXT:    [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
+; NO-VP-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 16
+; NO-VP-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 32
+; NO-VP-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 48
+; NO-VP-NEXT:    store <16 x i32> [[TMP20]], ptr [[TMP28]], align 4
+; NO-VP-NEXT:    store <16 x i32> [[TMP21]], ptr [[TMP29]], align 4
+; NO-VP-NEXT:    store <16 x i32> [[TMP22]], ptr [[TMP30]], align 4
+; NO-VP-NEXT:    store <16 x i32> [[TMP23]], ptr [[TMP31]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; NO-VP-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; NO-VP:       vec.epilog.iter.check:
+; NO-VP-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; NO-VP-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; NO-VP:       vec.epilog.ph:
+; NO-VP-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; NO-VP-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT:    [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]]
+; NO-VP-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; NO-VP:       vec.epilog.vector.body:
+; NO-VP-NEXT:    [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX12]], 0
+; NO-VP-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]]
+; NO-VP-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4
+; NO-VP-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP33]]
+; NO-VP-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4
+; NO-VP-NEXT:    [[TMP38:%.*]] = add nsw <8 x i32> [[WIDE_LOAD14]], [[WIDE_LOAD13]]
+; NO-VP-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]]
+; NO-VP-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0
+; NO-VP-NEXT:    store <8 x i32> [[TMP38]], ptr [[TMP40]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; NO-VP-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; NO-VP-NEXT:    br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       vec.epilog.middle.block:
+; NO-VP-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; NO-VP-NEXT:    br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]]
+; NO-VP:       vec.epilog.scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP43]], [[TMP42]]
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..9b49d44
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
@@ -0,0 +1,89 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL:      vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT:  vector.body:
+; IF-EVL-NEXT:    EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:    vp<[[ST:%[0-9]+]]>    = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; IF-EVL-NEXT:    EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]>
+; IF-EVL-NEXT:    EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]>
+; IF-EVL-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:    WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, vp<[[MASK]]>
+; IF-EVL-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:    WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, vp<[[MASK]]>
+; IF-EVL-NEXT:    WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:    WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]>
+; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT:  No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP:      vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT:  vector.body:
+; NO-VP-NEXT:    EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT:    vp<[[ST:%[0-9]+]]>    = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT:    CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT:    WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT:    CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT:    WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT:    WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT:    WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT:    EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT:  No successors
+; NO-VP-NEXT: }
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
new file mode 100644
index 0000000..0b87270
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
+; CHECK-LABEL: define i32 @any_of_reduction_epilog(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT:    [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0
+; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
+; CHECK-NEXT:    [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[SELECT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %select, %loop ]
+  %gep = getelementptr inbounds i8, ptr %src, i64 %iv
+  %load = load i8, ptr %gep, align 1
+  %icmp = icmp eq i8 %load, 0
+  %select = select i1 %icmp, i32 1, i32 %red
+  %iv.next = add i64 %iv, 1
+  %icmp3 = icmp eq i64 %iv, %N
+  br i1 %icmp3, label %exit, label %loop
+
+exit:
+  ret i32 %select
+}
+
+
+define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
+; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog(
+; CHECK-SAME: i64 [[N:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[IND_END6:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]])
+; CHECK-NEXT:    [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false
+; CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RED_I1:%.*]] = phi i1 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[IV_2]], [[A]]
+; CHECK-NEXT:    [[SEL]] = select i1 [[CMP_1]], i1 [[RED_I1]], i1 false
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; CHECK-NEXT:    [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i1 [[SEL_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red.i1 = phi i1 [ false, %entry ], [ %sel, %loop ]
+  %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %cmp.1 = icmp eq i32 %iv.2, %a
+  %sel = select i1 %cmp.1, i1 %red.i1, i1 false
+  %iv.next = add i64 %iv, 1
+  %iv.2.next = add i32 %iv.2, 1
+  %cmp.2 = icmp eq i64 %iv, %N
+  br i1 %cmp.2, label %exit, label %loop
+
+exit:
+  ret i1 %sel
+
+; uselistorder directives
+  uselistorder i1 %sel, { 1, 0 }
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..a90b38c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP8]]
+; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-NEXT:    [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP16]], ptr [[TMP10]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; NO-VP-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..f510d47
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; NO-VP-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/PGOProfile/vtable_profile.ll b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
index a844003..aae1e2d 100644
--- a/llvm/test/Transforms/PGOProfile/vtable_profile.ll
+++ b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
@@ -1,9 +1,6 @@
 ; RUN: opt < %s -passes=pgo-instr-gen -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=GEN --implicit-check-not="VTable value profiling is presently not supported"
 ; RUN: opt < %s -passes=pgo-instr-gen,instrprof -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=LOWER --implicit-check-not="VTable value profiling is presently not supported"
 
-; __llvm_prf_vnm stores zlib-compressed vtable names.
-; REQUIRES: zlib
-
 source_filename = "vtable_local.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -59,7 +56,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; LOWER: $"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = comdat nodeduplicate
 ; LOWER: @__profvt__ZTV7Derived = global { i64, ptr, i32 } { i64 -4576307468236080025, ptr @_ZTV7Derived, i32 48 }, section "__llvm_prf_vtab", comdat, align 8
 ; LOWER: @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = internal global { i64, ptr, i32 } { i64 1419990121885302679, ptr @_ZTVN12_GLOBAL__N_15Base2E, i32 24 }, section "__llvm_prf_vtab", comdat, align 8
-; LOWER: @__llvm_prf_vnm = private constant [64 x i8] c"7>x\DA\8B\8F\0A\093wI-\CA,KMa,+IL\CAI\8D\CF\C9ON\CC\D1\CB\C9\B1\8E\07J\FA\19\1A\C5\BB\FB\F8;9\FA\C4\C7\FB\C5\1B\9A:%\16\A7\1A\B9\02\00\19:\12o", section "__llvm_prf_vns", align 1
+; LOWER: @__llvm_prf_vnm = private constant {{.*}}, section "__llvm_prf_vns", align 1
 ; LOWER: @llvm.used = appending global [5 x ptr] [ptr @__profvt__ZTV7Derived, ptr @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E", ptr @__llvm_prf_vnodes, ptr @__llvm_prf_nm, ptr @__llvm_prf_vnm], section "llvm.metadata"
 
 define i32 @_Z4funci(i32 %a) {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index 495ec0a..45e411d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -9,11 +9,7 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; CHECK-NEXT:    [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
diff --git a/llvm/test/Transforms/RemoveTraps/remove-traps.ll b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
index 80b86e0..c8d5fec 100644
--- a/llvm/test/Transforms/RemoveTraps/remove-traps.ll
+++ b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
@@ -1,18 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes='function(remove-traps)' -S | FileCheck %s --check-prefixes=NOPROFILE
 ; RUN: opt < %s -passes='function(remove-traps)' -remove-traps-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -S | FileCheck %s --check-prefixes=HOT
+; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
 ; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
 
 target triple = "x86_64-pc-linux-gnu"
 
 declare void @llvm.ubsantrap(i8 immarg)
+declare i1 @llvm.allow.ubsan.check(i8 immarg)
 
 define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; NOPROFILE-LABEL: define dso_local noundef i32 @simple(
 ; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) {
 ; NOPROFILE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; NOPROFILE-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; NOPROFILE:       3:
 ; NOPROFILE-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -24,7 +26,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; ALL-LABEL: define dso_local noundef i32 @simple(
 ; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
 ; ALL-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; ALL-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; ALL:       3:
 ; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -33,22 +36,24 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; ALL-NEXT:    ret i32 [[TMP5]]
 ;
-; HOT-LABEL: define dso_local noundef i32 @simple(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) {
-; HOT-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
-; HOT-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT:       3:
-; HOT-NEXT:    tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT:    unreachable
-; HOT:       4:
-; HOT-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT:    ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @simple(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99:       3:
+; HOT99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT:    unreachable
+; HOT99:       4:
+; HOT99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT:    ret i32 [[TMP5]]
 ;
 ; HOT70-LABEL: define dso_local noundef i32 @simple(
 ; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
 ; HOT70-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; HOT70-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; HOT70:       3:
 ; HOT70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -58,7 +63,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
 ; HOT70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
-  %hot = call i1 @llvm.experimental.hot()
+  %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+  %hot = xor i1 %allow, true
   %2 = or i1 %chk, %hot
   br i1 %2, label %3, label %4
 
@@ -76,7 +82,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; NOPROFILE-LABEL: define dso_local noundef i32 @hot(
 ; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
 ; NOPROFILE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; NOPROFILE-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; NOPROFILE:       3:
 ; NOPROFILE-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -88,7 +95,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; ALL-LABEL: define dso_local noundef i32 @hot(
 ; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
 ; ALL-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; ALL-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; ALL:       3:
 ; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -97,22 +105,24 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; ALL-NEXT:    ret i32 [[TMP5]]
 ;
-; HOT-LABEL: define dso_local noundef i32 @hot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
-; HOT-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT:       3:
-; HOT-NEXT:    tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT:    unreachable
-; HOT:       4:
-; HOT-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT:    ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @hot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99:       3:
+; HOT99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT:    unreachable
+; HOT99:       4:
+; HOT99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT:    ret i32 [[TMP5]]
 ;
 ; HOT70-LABEL: define dso_local noundef i32 @hot(
 ; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
 ; HOT70-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; HOT70-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; HOT70:       3:
 ; HOT70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -122,7 +132,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
 ; HOT70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
-  %hot = call i1 @llvm.experimental.hot()
+  %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+  %hot = xor i1 %allow, true
   %2 = or i1 %chk, %hot
   br i1 %2, label %3, label %4
 
@@ -139,7 +150,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; NOPROFILE-LABEL: define dso_local noundef i32 @veryHot(
 ; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
 ; NOPROFILE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; NOPROFILE-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; NOPROFILE:       3:
 ; NOPROFILE-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -151,7 +163,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; ALL-LABEL: define dso_local noundef i32 @veryHot(
 ; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
 ; ALL-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; ALL-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; ALL:       3:
 ; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -160,22 +173,24 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
 ; ALL-NEXT:    ret i32 [[TMP5]]
 ;
-; HOT-LABEL: define dso_local noundef i32 @veryHot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
-; HOT-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT:       3:
-; HOT-NEXT:    tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT:    unreachable
-; HOT:       4:
-; HOT-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT:    ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @veryHot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; HOT99-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99:       3:
+; HOT99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT:    unreachable
+; HOT99:       4:
+; HOT99-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT:    ret i32 [[TMP5]]
 ;
 ; HOT70-LABEL: define dso_local noundef i32 @veryHot(
 ; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
 ; HOT70-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], true
+; HOT70-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; HOT70-NEXT:    [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
 ; HOT70-NEXT:    br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
 ; HOT70:       3:
 ; HOT70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -185,7 +200,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
 ; HOT70-NEXT:    ret i32 [[TMP5]]
 ;
   %chk = icmp eq ptr %0, null
-  %hot = call i1 @llvm.experimental.hot()
+  %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+  %hot = xor i1 %allow, true
   %2 = or i1 %chk, %hot
   br i1 %2, label %3, label %4
 
@@ -206,7 +222,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; NOPROFILE-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
 ; NOPROFILE:       4:
 ; NOPROFILE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; NOPROFILE-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; NOPROFILE:       6:
 ; NOPROFILE-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -224,7 +241,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; ALL-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
 ; ALL:       4:
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; ALL-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; ALL:       6:
 ; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -236,23 +254,24 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; ALL-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; ALL-NEXT:    ret i32 [[TMP10]]
 ;
-; HOT-LABEL: define dso_local noundef i32 @branchColdFnHot(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
-; HOT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
-; HOT:       4:
-; HOT-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], false
-; HOT-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT:       6:
-; HOT-NEXT:    tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT:    unreachable
-; HOT:       7:
-; HOT-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT:    br label [[TMP9]]
-; HOT:       9:
-; HOT-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT:    ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; HOT99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; HOT99:       4:
+; HOT99-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99:       6:
+; HOT99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT:    unreachable
+; HOT99:       7:
+; HOT99-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT:    br label [[TMP9]]
+; HOT99:       9:
+; HOT99-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT:    ret i32 [[TMP10]]
 ;
 ; HOT70-LABEL: define dso_local noundef i32 @branchColdFnHot(
 ; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
@@ -260,7 +279,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 ; HOT70-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
 ; HOT70:       4:
 ; HOT70-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; HOT70-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; HOT70:       6:
 ; HOT70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -277,7 +297,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
 
 4:
   %chk = icmp eq ptr %1, null
-  %hot = call i1 @llvm.experimental.hot()
+  %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+  %hot = xor i1 %allow, true
   %5 = or i1 %chk, %hot
   br i1 %5, label %6, label %7
 
@@ -301,7 +322,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; NOPROFILE-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
 ; NOPROFILE:       4:
 ; NOPROFILE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; NOPROFILE-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; NOPROFILE:       6:
 ; NOPROFILE-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -319,7 +341,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; ALL-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
 ; ALL:       4:
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; ALL-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; ALL:       6:
 ; ALL-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -331,23 +354,24 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; ALL-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
 ; ALL-NEXT:    ret i32 [[TMP10]]
 ;
-; HOT-LABEL: define dso_local noundef i32 @branchHotFnCold(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
-; HOT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
-; HOT:       4:
-; HOT-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], true
-; HOT-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT:       6:
-; HOT-NEXT:    tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT:    unreachable
-; HOT:       7:
-; HOT-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT:    br label [[TMP9]]
-; HOT:       9:
-; HOT-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT:    ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; HOT99-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; HOT99:       4:
+; HOT99-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT:    [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99:       6:
+; HOT99-NEXT:    tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT:    unreachable
+; HOT99:       7:
+; HOT99-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT:    br label [[TMP9]]
+; HOT99:       9:
+; HOT99-NEXT:    [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT:    ret i32 [[TMP10]]
 ;
 ; HOT70-LABEL: define dso_local noundef i32 @branchHotFnCold(
 ; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
@@ -355,7 +379,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; HOT70-NEXT:    br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
 ; HOT70:       4:
 ; HOT70-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT:    [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
 ; HOT70-NEXT:    br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
 ; HOT70:       6:
 ; HOT70-NEXT:    tail call void @llvm.ubsantrap(i8 22)
@@ -372,7 +397,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 
 4:
   %chk = icmp eq ptr %1, null
-  %hot = call i1 @llvm.experimental.hot()
+  %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+  %hot = xor i1 %allow, true
   %5 = or i1 %chk, %hot
   br i1 %5, label %6, label %7
 
@@ -424,10 +450,10 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
 ; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
 ; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
 ;.
-; HOT: [[PROF16]] = !{!"function_entry_count", i64 1000}
-; HOT: [[PROF17]] = !{!"function_entry_count", i64 7000}
-; HOT: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
-; HOT: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; HOT99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
 ;.
 ; HOT70: [[PROF16]] = !{!"function_entry_count", i64 1000}
 ; HOT70: [[PROF17]] = !{!"function_entry_count", i64 7000}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 89ea15d..e492596 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
 ; CHECK-LABEL: define void @gather_2(
 ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
-; CHECK-NEXT:    [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
-; CHECK-NEXT:    store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT:    store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
 ; CHECK-NEXT:    store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
 ; CHECK-NEXT:    store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT:    store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
-; CHECK-NEXT:    store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
new file mode 100644
index 0000000..979d0ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=systemz-unknown -mcpu=z15 < %s -slp-threshold=-10 | FileCheck %s
+
+define i32 @test(ptr %0, ptr %1) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 32 to ptr), align 32
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 32
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <2 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i8> [[TMP16]], <2 x i8> [[TMP11]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i8> [[TMP12]], i32 0
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i8 [[TMP13]] to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i8> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 [[DOTNEG]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP10]]
+;
+  %3 = load i64, ptr inttoptr (i64 32 to ptr), align 32
+  %4 = load ptr, ptr %1, align 8
+  %5 = getelementptr inbounds i8, ptr %4, i64 32
+  %6 = load i64, ptr %5, align 8
+  %7 = icmp ne i64 %3, 0
+  %8 = zext i1 %7 to i32
+  %9 = icmp ne i64 %6, 0
+  %.neg = sext i1 %9 to i32
+  %10 = add nsw i32 %.neg, %8
+  ret i32 %10
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
new file mode 100644
index 0000000..84f7e21
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@e = global i8 0
+@c = global i16 0
+@d = global i32 0
+
+define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr @e, align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr @c, align 2
+; CHECK-NEXT:    [[CONV1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32769>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT:    [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; CHECK-NEXT:    [[XOR_31:%.*]] = and i32 [[TMP13]], -2
+; CHECK-NEXT:    store i32 [[XOR_31]], ptr @d, align 4
+; CHECK-NEXT:    ret i8 [[CONV4_30]]
+;
+entry:
+  %0 = load i8, ptr @e, align 1
+  %conv = sext i8 %0 to i32
+  %1 = load i16, ptr @c, align 2
+  %conv1 = zext i16 %1 to i32
+  %or.16 = or i32 %conv, 1
+  %add.16 = add nsw i32 %or.16, %conv1
+  %or.18 = or i32 %conv, 1
+  %add.18 = add nsw i32 %or.18, %conv1
+  %conv4.181 = or i32 %add.16, %add.18
+  %or.20 = or i32 %conv, 1
+  %add.20 = add nsw i32 %or.20, %conv1
+  %conv4.202 = or i32 %conv4.181, %add.20
+  %or.22 = or i32 %conv, 1
+  %add.22 = add nsw i32 %or.22, %conv1
+  %conv4.223 = or i32 %conv4.202, %add.22
+  %or.24 = or i32 %conv, 1
+  %add.24 = add nsw i32 %or.24, %conv1
+  %conv4.244 = or i32 %conv4.223, %add.24
+  %or.26 = or i32 %conv, 1
+  %add.26 = add nsw i32 %or.26, %conv1
+  %conv4.265 = or i32 %conv4.244, %add.26
+  %or.28 = or i32 %conv, 1
+  %add.28 = add nsw i32 %or.28, %conv1
+  %conv4.286 = or i32 %conv4.265, %add.28
+  %or.30 = or i32 %conv, 32769
+  %add.30 = add nsw i32 %or.30, %conv1
+  %conv4.307 = or i32 %conv4.286, %add.30
+  %conv4.30 = trunc i32 %conv4.307 to i8
+  %xor.31 = and i32 %or.30, -2
+  store i32 %xor.31, ptr @d, align 4
+  ret i8 %conv4.30
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 66e3fbf..4cc3c12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
 
 define void @PR49730() {
 ; CHECK-LABEL: @PR49730(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
 ; CHECK-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
new file mode 100644
index 0000000..6b27015
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %sptr, i64 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[SPTR:%.*]], i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV_I:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[IV2:%.*]] = getelementptr i8, ptr [[SPTR]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[IV2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV_I]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sle <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT:    [[AND33:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT:    ret i32 [[AND33]]
+;
+entry:
+  %conv.i = trunc i64 %0 to i32
+  %iv2 = getelementptr i8, ptr %sptr, i64 4
+  %1 = load i32, ptr %iv2, align 4
+  %cmp11 = icmp slt i32 %1, %conv.i
+  %cmp.i57 = icmp eq i32 %1, 0
+  %or.i5977 = or i1 %cmp.i57, %cmp11
+  %iv4 = getelementptr i8, ptr %sptr, i64 12
+  %2 = load i32, ptr %iv4, align 4
+  %cmp16 = icmp sle i32 %2, %conv.i
+  %cmp.i62 = icmp eq i32 %2, 0
+  %or.i6478 = or i1 %cmp.i62, %cmp16
+  %iv3 = getelementptr i8, ptr %sptr, i64 8
+  %3 = load i32, ptr %iv3, align 8
+  %cmp21 = icmp sgt i32 %3, %conv.i
+  %cmp.i67 = icmp eq i32 %3, 0
+  %or.i6979 = or i1 %cmp.i67, %cmp21
+  %iv5 = getelementptr i8, ptr %sptr, i64 16
+  %4 = load i32, ptr %iv5, align 8
+  %cmp26 = icmp slt i32 %conv.i, 0
+  %cmp.i72 = icmp eq i32 %4, 0
+  %or.i7480 = or i1 %cmp.i72, %cmp26
+  %and3183 = and i1 %or.i5977, %or.i6478
+  %and3284 = and i1 %and3183, %or.i6979
+  %and3385 = and i1 %and3284, %or.i7480
+  %and33 = zext i1 %and3385 to i32
+  ret i32 %and33
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index fc28d7a..e1fd8a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 136ab64..668d3c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -10,12 +10,14 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 false, i32 0, i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index b5a3c57..acc04be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -94,17 +94,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
 
 define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
 ; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
-; SSE-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 1, i32 3, i32 6, i32 0>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; SSE-NEXT:    [[S3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
 ; SSE-NEXT:    ret i1 [[S3]]
 ;
 ; AVX-LABEL: @logical_and_icmp_diff_preds(
@@ -391,17 +387,28 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT:    [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; SSE-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; SSE-NEXT:    ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
+; AVX-NEXT:    ret i1 [[TMP8]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index fb2b653..82085ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 46cca9b..1faeea7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
 ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
 ; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index 66229c2..8b131cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
 define void @test2() {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
new file mode 100644
index 0000000..8e98851
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
@@ -0,0 +1,23 @@
+main:9229397:0
+ 0: 0
+ 1: 0
+ 1.1: 47663
+ 1.2: 51871
+ 2: 48723
+ 3: 48723 bar:49018
+ 4: 49087
+ 5: 51871 bar:49588
+ 7: 0
+ 2: foo:1479916
+  1: 47663
+  1.1: 46683 bar:43238
+  2: 4519 bar:4932
+  3: 48723
+ 4: foo:1505537
+  1: 48604
+  1.1: 46965 bar:44479
+  2: 4613 bar:4967
+  3: 49087
+bar:2333388:196222
+ 0: 194449
+ 1: 194449
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
index ba4c611..d384794 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
@@ -1,8 +1,8 @@
 foo:3200:13
  1: 13
  2: 7
- 3: 6
- 4: 13
- 5: 7 _Z3barv:2 _Z3foov:5
- 6: 6 _Z3barv:4 _Z3foov:2
+ 4: 6
+ 6: 13
+ 3: 7 _Z3barv:2 _Z3foov:5
+ 5: 6 _Z3barv:4 _Z3foov:2
  !CFGChecksum: 563022570642068
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
index 62f9bd5..213bf0b 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
@@ -1,8 +1,8 @@
 foo:3200:13
  1: 13
  2: 7
- 3: 6
- 4: 13
- 5: 7
- 6: 6
+ 4: 6
+ 6: 13
+ 7: 7
+ 9: 6
  !CFGChecksum: 844530426352218
diff --git a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
new file mode 100644
index 0000000..eb69c18a
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
@@ -0,0 +1,229 @@
+; REQUIRES: x86_64-linux
+; REQUIRES: asserts
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
+
+; The profiled source code:
+
+;  volatile int x = 1;
+;  __attribute__((noinline)) int bar(int p) {
+;    return p;
+;  }
+
+;  __attribute__((always_inline)) int foo(int i, int p) {
+;    if (i % 10) return  bar(p);
+;    else return bar(p + 1);
+;  }
+
+;  int main() {
+;    for (int i = 0; i < 1000 * 1000; i++) {
+;       x += foo(i, x);
+;       x += bar(x);
+;       x += foo(i, x);
+;       x += bar(x);
+;    }
+;  }
+
+; The source code for the current build:
+
+;  volatile int x = 1;
+;  __attribute__((noinline)) int bar(int p) {
+;    return p;
+;  }
+
+;  __attribute__((always_inline)) int foo(int i, int p) {
+;    if (i % 10) return  bar(p);
+;    else return bar(p + 1);
+;  }
+
+;  int main() {
+;    if (x == 0)          // code change
+;      return 0;          // code change
+;    for (int i = 0; i < 1000 * 1000; i++) {
+;       x += foo(i, x);
+;       x += bar(x);
+;       if (i < 0)        // code change
+;         return 0;       // code change
+;       x += foo(i, x);
+;       x += bar(x);
+;    }
+;  }
+
+; CHECK: Run stale profile matching for bar
+
+; CHECK: Run stale profile matching for foo
+; CHECK: Callsite with callee:bar is matched from 1.1 to 1.1
+; CHECK: Callsite with callee:bar is matched from 2 to 2
+
+; CHECK: Run stale profile matching for main
+; CHECK: Callsite with callee:foo is matched from 4 to 2
+; CHECK: Callsite with callee:bar is matched from 5 to 3
+; CHECK: Callsite with callee:foo is matched from 8 to 4
+; CHECK: Callsite with callee:bar is matched from 9 to 5
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = dso_local global i32 1, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @bar(i32 noundef %p) #0 !dbg !9 {
+entry:
+  ret i32 %p, !dbg !13
+}
+
+; Function Attrs: alwaysinline nounwind uwtable
+define dso_local i32 @foo(i32 noundef %i, i32 noundef %p) #1 !dbg !14 {
+entry:
+  %rem = srem i32 %i, 10, !dbg !15
+  %tobool = icmp ne i32 %rem, 0, !dbg !15
+  br i1 %tobool, label %if.then, label %if.else, !dbg !16
+
+if.then:                                          ; preds = %entry
+  %call = call i32 @bar(i32 noundef %p), !dbg !17
+  br label %return, !dbg !19
+
+if.else:                                          ; preds = %entry
+  %add = add nsw i32 %p, 1, !dbg !20
+  %call1 = call i32 @bar(i32 noundef %add), !dbg !21
+  br label %return, !dbg !22
+
+return:                                           ; preds = %if.else, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ], !dbg !23
+  ret i32 %retval.0, !dbg !24
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() #2 !dbg !25 {
+entry:
+  %0 = load volatile i32, ptr @x, align 4, !dbg !26, !tbaa !27
+  %cmp = icmp eq i32 %0, 0, !dbg !31
+  br i1 %cmp, label %if.then, label %if.end, !dbg !26
+
+if.then:                                          ; preds = %entry
+  br label %for.end, !dbg !32
+
+if.end:                                           ; preds = %entry
+  br label %for.cond, !dbg !33
+
+for.cond:                                         ; preds = %if.end6, %if.end
+  %i.0 = phi i32 [ 0, %if.end ], [ %inc, %if.end6 ], !dbg !34
+  %cmp1 = icmp slt i32 %i.0, 1000000, !dbg !35
+  br i1 %cmp1, label %for.body, label %for.cond.cleanup, !dbg !37
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %cleanup, !dbg !38
+
+for.body:                                         ; preds = %for.cond
+  %1 = load volatile i32, ptr @x, align 4, !dbg !40, !tbaa !27
+  %call = call i32 @foo(i32 noundef %i.0, i32 noundef %1), !dbg !41
+  %2 = load volatile i32, ptr @x, align 4, !dbg !42, !tbaa !27
+  %add = add nsw i32 %2, %call, !dbg !42
+  store volatile i32 %add, ptr @x, align 4, !dbg !42, !tbaa !27
+  %3 = load volatile i32, ptr @x, align 4, !dbg !43, !tbaa !27
+  %call2 = call i32 @bar(i32 noundef %3), !dbg !44
+  %4 = load volatile i32, ptr @x, align 4, !dbg !45, !tbaa !27
+  %add3 = add nsw i32 %4, %call2, !dbg !45
+  store volatile i32 %add3, ptr @x, align 4, !dbg !45, !tbaa !27
+  br i1 false, label %if.then5, label %if.end6, !dbg !46
+
+if.then5:                                         ; preds = %for.body
+  br label %cleanup, !dbg !47
+
+if.end6:                                          ; preds = %for.body
+  %5 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !27
+  %call7 = call i32 @foo(i32 noundef %i.0, i32 noundef %5), !dbg !49
+  %6 = load volatile i32, ptr @x, align 4, !dbg !50, !tbaa !27
+  %add8 = add nsw i32 %6, %call7, !dbg !50
+  store volatile i32 %add8, ptr @x, align 4, !dbg !50, !tbaa !27
+  %7 = load volatile i32, ptr @x, align 4, !dbg !51, !tbaa !27
+  %call9 = call i32 @bar(i32 noundef %7), !dbg !52
+  %8 = load volatile i32, ptr @x, align 4, !dbg !53, !tbaa !27
+  %add10 = add nsw i32 %8, %call9, !dbg !53
+  store volatile i32 %add10, ptr @x, align 4, !dbg !53, !tbaa !27
+  %inc = add nsw i32 %i.0, 1, !dbg !54
+  br label %for.cond, !dbg !56, !llvm.loop !57
+
+cleanup:                                          ; preds = %if.then5, %for.cond.cleanup
+  br label %for.end
+
+for.end:                                          ; preds = %cleanup, %if.then
+  ret i32 0, !dbg !61
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3
+
+attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #1 = { alwaysinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #2 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "path")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{!"clang version 19.0.0git"}
+!9 = distinct !DISubprogram(name: "bar", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DIFile(filename: "test.c", directory: "path")
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 3, column: 3, scope: !9)
+!14 = distinct !DISubprogram(name: "foo", scope: !10, file: !10, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = !DILocation(line: 7, column: 9, scope: !14)
+!16 = !DILocation(line: 7, column: 7, scope: !14)
+!17 = !DILocation(line: 7, column: 23, scope: !18)
+!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 2)
+!19 = !DILocation(line: 7, column: 15, scope: !18)
+!20 = !DILocation(line: 8, column: 21, scope: !14)
+!21 = !DILocation(line: 8, column: 15, scope: !14)
+!22 = !DILocation(line: 8, column: 8, scope: !14)
+!23 = !DILocation(line: 0, scope: !14)
+!24 = !DILocation(line: 9, column: 1, scope: !14)
+!25 = distinct !DISubprogram(name: "main", scope: !10, file: !10, line: 11, type: !11, scopeLine: 11, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!26 = !DILocation(line: 12, column: 7, scope: !25)
+!27 = !{!28, !28, i64 0}
+!28 = !{!"int", !29, i64 0}
+!29 = !{!"omnipotent char", !30, i64 0}
+!30 = !{!"Simple C/C++ TBAA"}
+!31 = !DILocation(line: 12, column: 9, scope: !25)
+!32 = !DILocation(line: 13, column: 5, scope: !25)
+!33 = !DILocation(line: 14, column: 8, scope: !25)
+!34 = !DILocation(line: 14, scope: !25)
+!35 = !DILocation(line: 14, column: 21, scope: !36)
+!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 2)
+!37 = !DILocation(line: 14, column: 3, scope: !36)
+!38 = !DILocation(line: 14, column: 3, scope: !39)
+!39 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 4)
+!40 = !DILocation(line: 15, column: 18, scope: !25)
+!41 = !DILocation(line: 15, column: 11, scope: !25)
+!42 = !DILocation(line: 15, column: 8, scope: !25)
+!43 = !DILocation(line: 16, column: 15, scope: !25)
+!44 = !DILocation(line: 16, column: 11, scope: !25)
+!45 = !DILocation(line: 16, column: 8, scope: !25)
+!46 = !DILocation(line: 17, column: 10, scope: !25)
+!47 = !DILocation(line: 18, column: 8, scope: !25)
+!48 = !DILocation(line: 19, column: 18, scope: !25)
+!49 = !DILocation(line: 19, column: 11, scope: !25)
+!50 = !DILocation(line: 19, column: 8, scope: !25)
+!51 = !DILocation(line: 20, column: 15, scope: !25)
+!52 = !DILocation(line: 20, column: 11, scope: !25)
+!53 = !DILocation(line: 20, column: 8, scope: !25)
+!54 = !DILocation(line: 14, column: 37, scope: !55)
+!55 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 6)
+!56 = !DILocation(line: 14, column: 3, scope: !55)
+!57 = distinct !{!57, !58, !59, !60}
+!58 = !DILocation(line: 14, column: 3, scope: !25)
+!59 = !DILocation(line: 21, column: 3, scope: !25)
+!60 = !{!"llvm.loop.mustprogress"}
+!61 = !DILocation(line: 22, column: 1, scope: !25)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
index 4881937..43be142 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
@@ -1,7 +1,9 @@
 ; REQUIRES: x86_64-linux
 ; REQUIRES: asserts
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl  -pass-remarks=inline 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline  -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl  -pass-remarks=inline 2>&1 | FileCheck %s
 
+; There is no profile-checksum-mismatch attr, even the checksum is mismatched in the pseudo_probe_desc, it doesn't run the matching.
+; CHECK-NOT: Run stale profile matching for main
 
 ; CHECK: Run stale profile matching for bar
 ; CHECK: Callsite with callee:baz is matched from 4 to 2
@@ -14,7 +16,7 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @main() #0 {
+define available_externally i32 @main() #0 {
   %1 = call i32 @bar(), !dbg !13
   ret i32 0
 }
@@ -47,7 +49,8 @@ attributes #1 = { "profile-checksum-mismatch" "use-sample-profile" }
 !9 = distinct !DICompileUnit(language: DW_LANG_C11, file: !10, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
 !10 = !DIFile(filename: "test2.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "553093afc026f9c73562eb3b0c5b7532")
 !11 = !{i32 2, !"Debug Info Version", i32 3}
-!12 = !{i64 -2624081020897602054, i64 281582081721716, !"main"}
+; Make a checksum mismatch in the pseudo_probe_desc
+!12 = !{i64 -2624081020897602054, i64 123456, !"main"}
 !13 = !DILocation(line: 8, column: 10, scope: !14)
 !14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 186646591)
 !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 7, column: 40)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
index 4647a34f..f0b6fdf 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
@@ -23,21 +23,21 @@ Merge:
 ; JT-LABEL-NO: T
 ; JT-LABEL-NO: F
 ; JT-LABEL: Merge
+; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4
 ; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3
-; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2
-; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; ASM-NOT: .pseudoprobe	6699318081062747564 4
 ; ASM-NOT: .pseudoprobe	6699318081062747564 3
-; ASM-NOT: .pseudoprobe	6699318081062747564 2
-; ASM: .pseudoprobe	6699318081062747564 4 0 0
+; ASM: .pseudoprobe	6699318081062747564 5 0 0
 	ret i32 %call
 }
 
 ;; Check block T and F are gone, and their probes (probe 2 and 3) are gone too.
 ; MIR-tail: bb.0
 ; MIR-tail: PSEUDO_PROBE [[#GUID:]], 1, 0, 0
-; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 2
 ; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 3
-; MIR-tail: PSEUDO_PROBE [[#GUID:]], 4, 0, 0
+; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 4
+; MIR-tail: PSEUDO_PROBE [[#GUID:]], 5, 0, 0
 
 
 define i32 @test(i32 %a, i32 %b, i32 %c) {
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
index 62f0737..97b0ed6 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
@@ -62,10 +62,10 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "fra
 ; DEBUG: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
 ; DEBUG: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
 
-           
+
 ; PROBE: ![[CALL1]] = !DILocation(line: 4, column: 3, scope: ![[CALL1BLOCK:[0-9]+]])
-; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646575)
+; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646559)
 ; PROBE: ![[CALL2]] = !DILocation(line: 4, column: 9, scope: ![[CALL2BLOCK:[0-9]+]])
-; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646583)
+; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646567)
 ; PROBE: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
 ; PROBE: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
index 822ab40..03bb64b 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
@@ -18,10 +18,12 @@ entry:
 
 if.then:                                          ; preds = %entry
 ; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 2
+; callsite probe 3
   invoke void @_Z3foov()
           to label %invoke.cont unwind label %terminate.lpad, !dbg !24
 
 invoke.cont:                                      ; preds = %if.then
+; callsite probe 4
 ; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
   invoke void @_Z3bazv()
           to label %invoke.cont1 unwind label %terminate.lpad, !dbg !26
@@ -31,7 +33,8 @@ invoke.cont1:                                     ; preds = %invoke.cont
   br label %if.end, !dbg !27
 
 if.else:                                          ; preds = %entry
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 3
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; callsite probe 6
   invoke void @_Z3foov()
           to label %invoke.cont2 unwind label %terminate.lpad, !dbg !28
 
@@ -40,7 +43,8 @@ invoke.cont2:                                     ; preds = %if.else
   br label %if.end
 
 if.end:                                           ; preds = %invoke.cont2, %invoke.cont1
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 4
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 7
+; callsite probe 8
   invoke void @_Z3foov()
           to label %invoke.cont3 unwind label %terminate.lpad, !dbg !29
 
@@ -51,14 +55,14 @@ invoke.cont3:                                     ; preds = %if.end
   br i1 %tobool4, label %if.then5, label %if.end6, !dbg !32
 
 if.then5:                                         ; preds = %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 9
   %2 = load volatile i32, ptr @x, align 4, !dbg !33, !tbaa !19
   %inc = add nsw i32 %2, 1, !dbg !33
   store volatile i32 %inc, ptr @x, align 4, !dbg !33, !tbaa !19
   br label %if.end6, !dbg !35
 
 if.end6:                                          ; preds = %if.then5, %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 6
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 10
   ret void, !dbg !36
 
 terminate.lpad:                                   ; preds = %if.end, %if.else, %invoke.cont, %if.then
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
index 148f3ed..379dcfc 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
@@ -29,7 +29,7 @@ if.else:
   br label %return
 
 return:
-  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1)
+  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1)
   %1 = load i32, ptr %retval, align 4
   ret i32 %1
 }
@@ -55,13 +55,12 @@ attributes #0 = {"use-sample-profile"}
 !9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !5, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 !10 = !{!"function_entry_count", i64 14}
 !11 = !{!"branch_weights", i32 100, i32 0}
-;; A discriminator of 186646575 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 1.0.
-!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646575)
+;; A discriminator of 186646559 which is 0xB20001F in hexdecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 1.0.
+!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646559)
 !13 = distinct !DILocation(line: 10, column: 11, scope: !12)
-;; A discriminator of 134217775 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 0.
-!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217775)
+;; A discriminator of 134217759 which is 0x800001F in hexdecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 0.
+!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217759)
 !15 = distinct !DILocation(line: 10, column: 11, scope: !14)
 !16 = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 474b666..867a49d 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -22,12 +22,12 @@ if.then:
 if.else:
   ; CHECK: call {{.*}}, !dbg ![[#PROBE2:]], !prof ![[PROF2:[0-9]+]]
   call void %f(i32 2)
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
   store i32 2, ptr %retval, align 4
   br label %return
 
 return:
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
   %1 = load i32, ptr %retval, align 4
   ret i32 %1
 }
@@ -36,14 +36,14 @@ attributes #0 = {"use-sample-profile"}
 
 ; CHECK: ![[PD1]] = !{!"branch_weights", i32 8, i32 7}
 ; CHECK: ![[#PROBE1]] = !DILocation(line: 0, scope: ![[#SCOPE1:]])
+;; A discriminator of 119537695 which is 0x720001f in hexdecimal, stands for an indirect call probe
+;; with an index of 3.
+; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537695)
+; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
 ;; A discriminator of 119537711 which is 0x720002f in hexdecimal, stands for an indirect call probe
 ;; with an index of 5.
-; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
-; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-;; A discriminator of 119537719 which is 0x7200037 in hexdecimal, stands for an indirect call probe
-;; with an index of 6.
 ; CHECK: ![[#PROBE2]] = !DILocation(line: 0, scope: ![[#SCOPE2:]])
-; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537719)
+; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
 ; CHECK: ![[PROF2]] = !{!"VP", i32 0, i64 6, i64 -1069303473483922844, i64 4, i64 9191153033785521275, i64 2}
 
 !llvm.module.flags = !{!9, !10}
@@ -83,7 +83,7 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - String:          'Applied '
 ;YAML-NEXT:    - NumSamples:      '7'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
-;YAML-NEXT:    - ProbeId:         '5'
+;YAML-NEXT:    - ProbeId:         '3'
 ;YAML-NEXT:    - String:          ', Factor='
 ;YAML-NEXT:    - Factor:          '1.000000e+00'
 ;YAML-NEXT:    - String:          ', OriginalSamples='
@@ -113,7 +113,7 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - String:          'Applied '
 ;YAML-NEXT:    - NumSamples:      '6'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
-;YAML-NEXT:    - ProbeId:         '6'
+;YAML-NEXT:    - ProbeId:         '5'
 ;YAML-NEXT:    - String:          ', Factor='
 ;YAML-NEXT:    - Factor:          '1.000000e+00'
 ;YAML-NEXT:    - String:          ', OriginalSamples='
@@ -128,7 +128,7 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - String:          'Applied '
 ;YAML-NEXT:    - NumSamples:      '6'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
-;YAML-NEXT:    - ProbeId:         '3'
+;YAML-NEXT:    - ProbeId:         '4'
 ;YAML-NEXT:    - String:          ', Factor='
 ;YAML-NEXT:    - Factor:          '1.000000e+00'
 ;YAML-NEXT:    - String:          ', OriginalSamples='
@@ -143,7 +143,7 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - String:          'Applied '
 ;YAML-NEXT:    - NumSamples:      '13'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
-;YAML-NEXT:    - ProbeId:         '4'
+;YAML-NEXT:    - ProbeId:         '6'
 ;YAML-NEXT:    - String:          ', Factor='
 ;YAML-NEXT:    - Factor:          '1.000000e+00'
 ;YAML-NEXT:    - String:          ', OriginalSamples='
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
index 992afed..217b619 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
@@ -14,15 +14,15 @@ T1:
 	%v1 = call i32 @f1()
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
 ;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -8513881372706734080)
     %cond3 = icmp eq i32 %v1, 412
 	br label %Merge
 F1:
 ; CHECK: %v2 = call i32 @f2(), !prof ![[#PROF2:]]
 	%v2 = call i32 @f2()
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
 ;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 8513881922462547968)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 8513881922462547968)
 	br label %Merge
 Merge:
 
@@ -30,11 +30,11 @@ Merge:
 	%B = phi i32 [%v1, %T1], [%v2, %F1]
 	br i1 %A, label %T2, label %F2
 T2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 7, i32 0, i64 -1)
 	call void @f3()
 	ret i32 %B
 F2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 9, i32 0, i64 -1)
 	ret i32 %B
 }
 
@@ -42,4 +42,3 @@ F2:
 ; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 6}
 
 attributes #0 = {"use-sample-profile"}
-
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
index f70e518..b622cfb 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
@@ -4,7 +4,7 @@
 
 ; VERIFY: *** Pseudo Probe Verification After LoopFullUnrollPass ***
 ; VERIFY: Function foo:
-; VERIFY-DAG: Probe 6	previous factor 1.00	current factor 5.00
+; VERIFY-DAG: Probe 5	previous factor 1.00	current factor 5.00
 ; VERIFY-DAG: Probe 4	previous factor 1.00	current factor 5.00
 
 declare void @foo2() nounwind
@@ -27,15 +27,15 @@ bb7.preheader:
 
 bb10:
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
   %indvars.iv = phi i64 [ 0, %bb7.preheader ], [ %indvars.iv.next, %bb10 ]
   %tmp1.14 = phi i32 [ %tmp1.06, %bb7.preheader ], [ %spec.select, %bb10 ]
@@ -50,14 +50,14 @@ bb10:
   br i1 %exitcond.not, label %bb3.loopexit, label %bb10, !llvm.loop !13
 
 bb24:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
   ret void
 }
 
 ;; A discriminator of 186646583 which is 0xb200037 in hexdecimal, stands for a direct call probe
 ;; with an index of 6 and a scale of -1%.
 ; CHECK: ![[#PROBE6]] = !DILocation(line: 2, column: 20, scope: ![[#SCOPE:]])
-; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646583)
+; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646575)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
new file mode 100644
index 0000000..2031c2d
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+
+; standard vector concatenations
+
+define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_v8i16_v16i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = zext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = zext nneg <8 x i16> %a0 to <8 x i32>
+  %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; SSE-NEXT:    [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; SSE-NEXT:    [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
+; SSE-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i32> [[R]]
+;
+; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; AVX-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v8i16_v16i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %r
+}
+
+define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
+; CHECK-LABEL: @concat_sext_v4i1_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %x0 = sext <4 x i1> %a0 to <4 x i32>
+  %x1 = sext <4 x i1> %a1 to <4 x i32>
+  %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %r
+}
+
+define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %x0 = trunc <4 x i32> %a0 to <4 x i16>
+  %x1 = trunc <4 x i32> %a1 to <4 x i16>
+  %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %r
+}
+
+define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
+; CHECK-NEXT:    ret <8 x ptr> [[R]]
+;
+  %x0 = inttoptr <4 x i32> %a0 to <4 x ptr>
+  %x1 = inttoptr <4 x i32> %a1 to <4 x ptr>
+  %r = shufflevector <4 x ptr> %x0, <4 x ptr> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x ptr> %r
+}
+
+define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
+; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64>
+; CHECK-NEXT:    ret <16 x i64> [[R]]
+;
+  %x0 = ptrtoint <8 x ptr> %a0 to <8 x i64>
+  %x1 = ptrtoint <8 x ptr> %a1 to <8 x i64>
+  %r = shufflevector <8 x i64> %x0, <8 x i64> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %r
+}
+
+define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: @concat_fpext_v4f32_v8f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; SSE-NEXT:    ret <8 x double> [[R]]
+;
+; AVX-LABEL: @concat_fpext_v4f32_v8f64(
+; AVX-NEXT:    [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
+; AVX-NEXT:    [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x double> [[R]]
+;
+  %x0 = fpext <4 x float> %a0 to <4 x double>
+  %x1 = fpext <4 x float> %a1 to <4 x double>
+  %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %r
+}
+
+define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) {
+; CHECK-LABEL: @concat_fptrunc_v8f64_v16f32(
+; CHECK-NEXT:    [[X0:%.*]] = fptrunc <8 x double> [[A0:%.*]] to <8 x float>
+; CHECK-NEXT:    [[X1:%.*]] = fptrunc <8 x double> [[A1:%.*]] to <8 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x float> [[R]]
+;
+  %x0 = fptrunc <8 x double> %a0 to <8 x float>
+  %x1 = fptrunc <8 x double> %a1 to <8 x float>
+  %r = shufflevector <8 x float> %x0, <8 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %r
+}
+
+; commuted vector concatenation
+
+define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @rconcat_sext_v8i16_v16i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i32> %r
+}
+
+; interleaved shuffle
+
+define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: @interleave_fpext_v4f32_v8f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    ret <8 x double> [[R]]
+;
+  %x0 = fpext <4 x float> %a0 to <4 x double>
+  %x1 = fpext <4 x float> %a1 to <4 x double>
+  %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %r
+}
+
+; negative - multiuse
+
+define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16_multiuse(
+; CHECK-NEXT:    [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    store <4 x i16> [[X0]], ptr [[A2:%.*]], align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %x0 = trunc <4 x i32> %a0 to <4 x i16>
+  %x1 = trunc <4 x i32> %a1 to <4 x i16>
+  %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %x0, ptr %a2
+  ret <8 x i16> %r
+}
+
+; negative - bitcasts
+
+define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_bitcast_v4i32_v8f32(
+; CHECK-NEXT:    [[X0:%.*]] = bitcast <4 x i32> [[A0:%.*]] to <4 x float>
+; CHECK-NEXT:    [[X1:%.*]] = bitcast <4 x i32> [[A1:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %x0 = bitcast <4 x i32> %a0 to <4 x float>
+  %x1 = bitcast <4 x i32> %a1 to <4 x float>
+  %r = shufflevector <4 x float> %x0, <4 x float> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %r
+}
+
+; negative - src type mismatch
+
+define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v4i8_v4i16_v8i32(
+; CHECK-NEXT:    [[X0:%.*]] = sext <4 x i8> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[X1:%.*]] = sext <4 x i16> [[A1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %x0 = sext <4 x i8> %a0 to <4 x i32>
+  %x1 = sext <4 x i16> %a1 to <4 x i32>
+  %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %r
+}
+
+; negative - castop mismatch
+
+define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_zext_v8i16_v16i32(
+; CHECK-NEXT:    [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %r
+}