Diffstat (limited to 'llvm/test/Transforms')
90 files changed, 1481 insertions, 1116 deletions
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
index 5127e92..4b8ac09 100644
--- a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
@@ -757,8 +757,7 @@ define i1 @add_neg_1_known_sge_ult_1(i32 %a) {
 ; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 1
 ; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -1
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
 %a.sge = icmp sge i32 %a, 1
@@ -823,8 +822,7 @@ define i1 @add_neg_3_known_sge_ult_1(i32 %a) {
 ; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 3
 ; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -3
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
 %a.sge = icmp sge i32 %a, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
index 52adc78..8dcac78 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
@@ -389,8 +389,7 @@ define i1 @gep_count_add_1_sge_known_ult_1(i32 %count, ptr %p) {
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
 ; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
 ; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp ult ptr [[GEP_SUB]], [[GEP_COUNT]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
 %sge = icmp sge i32 %count, 1
@@ -415,8 +414,7 @@ define i1 @gep_count_add_1_sge_known_uge_1(i32 %count, ptr %p) {
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
 ; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
 ; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp uge ptr [[GEP_SUB]], [[P]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
 %sge = icmp sge i32 %count, 1
diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
index a27ca9d..f6313c7 100644
--- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
@@ -21,14 +21,14 @@
 ; updated LLVM functions.

 ; Function Attrs: uwtable
-define void @_Z2f2v() #0 !dbg !4 {
+define void @_Z2f2v() !dbg !4 {
 entry:
 call void (i32, ...) @_ZL2f1iz(i32 1), !dbg !15
 ret void, !dbg !16
 }

 ; Function Attrs: nounwind uwtable
-define internal void @_ZL2f1iz(i32, ...) #1 !dbg !8 {
+define internal void @_ZL2f1iz(i32, ...) !dbg !8 {
 entry:
 call void @llvm.dbg.value(metadata i32 %0, metadata !17, metadata !18), !dbg !19
 ret void, !dbg !20
@@ -40,8 +40,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, metadata, metadata) #2

-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }

 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
index f66aa0f..2cec6b5 100644
--- a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @g = common global [1 x i8] zeroinitializer, align 1, !dbg !0

 ; Function Attrs: noinline nounwind uwtable
-define void @foo() #0 !dbg !14 {
+define void @foo() !dbg !14 {
 entry:
 %i = alloca i8, align 1
 store i8 1, ptr %i, align 1, !dbg !19
@@ -37,7 +37,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #2

-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { argmemonly nounwind }
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
index 4d3a241..628669d 100644
--- a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
@@ -3,13 +3,11 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "x86_64-unknown-linux-gnu"

 ; Function Attrs: nounwind uwtable
-define void @func() #0 !dbg !4 {
+define void @func() !dbg !4 {
 entry:
 ret void, !dbg !10
 }

-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
index f449047..713dc4e 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
@@ -24,16 +24,13 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "x86_64-unknown-linux-gnu"

 ; Function Attrs: nounwind uwtable
-define i32 @main() #0 !dbg !4 {
+define i32 @main() !dbg !4 {
 entry:
 call void (...) @func(), !dbg !11
 ret i32 0, !dbg !12
 }

-declare void @func(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @func(...)

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 1840f04..543edac 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = common global i32 0, align 4, !dbg !9

 ; Function Attrs: nounwind uwtable
-define void @test() #0 !dbg !4 {
+define void @test() !dbg !4 {
 entry:
 tail call void (...) @f() #2, !dbg !14
 %0 = load i32, ptr @A, align 4, !dbg !15
@@ -28,12 +28,10 @@ if.end: ; preds = %entry, %if.then
 ret void, !dbg !18
 }

-declare void @f(...) #1
+declare void @f(...)

-declare void @g(...) #1
+declare void @g(...)

-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }

 !llvm.gcov = !{!19}
diff --git a/llvm/test/Transforms/GCOVProfiling/linezero.ll b/llvm/test/Transforms/GCOVProfiling/linezero.ll
index 8bfeabd..1eae413 100644
--- a/llvm/test/Transforms/GCOVProfiling/linezero.ll
+++ b/llvm/test/Transforms/GCOVProfiling/linezero.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct.vector = type { i8 }

 ; Function Attrs: nounwind
-define i32 @_Z4testv() #0 !dbg !15 {
+define i32 @_Z4testv() !dbg !15 {
 entry:
 %retval = alloca i32, align 4
 %__range = alloca ptr, align 8
@@ -63,19 +63,19 @@ return: ; No predecessors!
 }

 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)

-declare void @_Z13TagFieldSpecsv() #2
+declare void @_Z13TagFieldSpecsv()

-declare ptr @_ZN6vector5beginEv(ptr) #2
+declare ptr @_ZN6vector5beginEv(ptr)

-declare ptr @_ZN6vector3endEv(ptr) #2
+declare ptr @_ZN6vector3endEv(ptr)

 ; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()

 ; Function Attrs: nounwind
-define void @_Z2f1v() #0 !dbg !20 {
+define void @_Z2f1v() !dbg !20 {
 entry:
 br label %0
@@ -83,11 +83,6 @@ entry:
 ret void, !dbg !45
 }

-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noreturn nounwind }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23, !24}
 !llvm.gcov = !{!25}
diff --git a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
index 7f169e2..98bdd74 100644
--- a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
+++ b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT: load {{.*}} @__llvm_gcov_ctr
 ; CHECK-NOT: load {{.*}} @__llvm_gcov_ctr

-define dso_local i32 @cannot_split(ptr nocapture readonly %p) #0 !dbg !7 {
+define dso_local i32 @cannot_split(ptr nocapture readonly %p) !dbg !7 {
 entry:
 %targets = alloca <2 x ptr>, align 16
 store <2 x ptr> <ptr blockaddress(@cannot_split, %indirect), ptr blockaddress(@cannot_split, %end)>, ptr %targets, align 16, !dbg !9
@@ -42,8 +42,6 @@ end: ; preds = %indirect
 ret i32 0, !dbg !22
 }

-attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/GVN/cond_br2.ll b/llvm/test/Transforms/GVN/cond_br2.ll
index 6ceec95..4811ad0 100644
--- a/llvm/test/Transforms/GVN/cond_br2.ll
+++ b/llvm/test/Transforms/GVN/cond_br2.ll
@@ -11,12 +11,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 %"union.llvm::SmallVectorBase::U" = type { x86_fp80 }

 ; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: define void @_Z4testv(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-SAME: ) personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[SV:%.*]] = alloca %"class.llvm::SmallVector", align 16
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]])
 ; CHECK-NEXT: [[FIRSTEL_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
 ; CHECK-NEXT: store ptr [[FIRSTEL_I_I_I_I_I_I]], ptr [[SV]], align 16, !tbaa [[ANYPTR_TBAA0:![0-9]+]]
 ; CHECK-NEXT: [[ENDX_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -66,10 +66,10 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[CMP_I_I_I_I19:%.*]] = icmp eq ptr [[TMP0]], [[FIRSTEL_I_I_I_I_I_I]]
 ; CHECK-NEXT: br i1 [[CMP_I_I_I_I19]], label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21:.*]], label %[[IF_THEN_I_I_I20:.*]]
 ; CHECK: [[IF_THEN_I_I_I20]]:
-; CHECK-NEXT: call void @free(ptr [[TMP0]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP0]])
 ; CHECK-NEXT: br label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]
 ; CHECK: [[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]:
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]]) #[[ATTR4]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]])
 ; CHECK-NEXT: ret void
 ; CHECK: [[LPAD]]:
 ; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
@@ -78,7 +78,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp eq ptr [[TMP2]], [[FIRSTEL_I_I_I_I_I_I]]
 ; CHECK-NEXT: br i1 [[CMP_I_I_I_I]], label %[[EH_RESUME:.*]], label %[[IF_THEN_I_I_I:.*]]
 ; CHECK: [[IF_THEN_I_I_I]]:
-; CHECK-NEXT: call void @free(ptr [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP2]])
 ; CHECK-NEXT: br label %[[EH_RESUME]]
 ; CHECK: [[EH_RESUME]]:
 ; CHECK-NEXT: resume { ptr, i32 } [[TMP1]]
@@ -86,7 +86,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 entry:
 %sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
 %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
 store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
 %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -151,11 +151,11 @@ invoke.cont3: ; preds = %invoke.cont2
 br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20

 if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
 br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21

 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
 ret void

 lpad: ; preds = %if.end.i14, %if.end
@@ -166,7 +166,7 @@ lpad: ; preds = %if.end.i14, %if.end
 br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i

 if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
 br label %eh.resume

 eh.resume: ; preds = %if.then.i.i.i, %lpa
@@ -174,24 +174,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
 }

 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)

 declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)

 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)

-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)

 ; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)

 !0 = !{!"any pointer", !1}
 !1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/GVN/pr33549.ll b/llvm/test/Transforms/GVN/pr33549.ll
index a8ce37c..a1e28f7 100644
--- a/llvm/test/Transforms/GVN/pr33549.ll
+++ b/llvm/test/Transforms/GVN/pr33549.ll
@@ -4,9 +4,9 @@
 @Data = common local_unnamed_addr global [32 x i32] zeroinitializer, align 4

 ; Function Attrs: norecurse nounwind
-define void @testshl() local_unnamed_addr #0 {
+define void @testshl() local_unnamed_addr {
 ; CHECK-LABEL: define void @testshl(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
@@ -78,8 +78,6 @@ for.end10: ; preds = %for.inc8
 ret void
 }

-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+strict-align,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/GVN/pr42605.ll b/llvm/test/Transforms/GVN/pr42605.ll
index 3e6241c..98b447f8 100644
--- a/llvm/test/Transforms/GVN/pr42605.ll
+++ b/llvm/test/Transforms/GVN/pr42605.ll
@@ -114,7 +114,7 @@ if.end: ; preds = %if.then, %entry
 ret void
 }

-attributes #0 = { noinline norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind readonly uwtable }

 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
index b9c8537..082d016 100644
--- a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
+++ b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
@@ -14,7 +14,7 @@
 @res = common global i32 0, align 4

 ; Function Attrs:
-define i64 @func() #0 {
+define i64 @func() {
 entry:
 ret i64 1
 }
@@ -27,7 +27,7 @@ entry:
 %2 = load volatile i32, ptr @g_x_u, align 4
 %3 = load volatile i32, ptr @g_z_u, align 4
 %4 = load volatile i32, ptr @g_m, align 4
- %call = call i64 @func() #4
+ %call = call i64 @func()
 %conv = sext i32 %1 to i64
 %cmp = icmp ne i64 %call, %conv
 br i1 %cmp, label %if.end, label %lor.lhs.false
@@ -42,7 +42,7 @@ if.then:
 br label %cleanup

 if.end:
- %call4 = call i64 @func() #4
+ %call4 = call i64 @func()
 %conv5 = zext i32 %3 to i64
 %cmp6 = icmp ne i64 %call4, %conv5
 br i1 %cmp6, label %if.end14, label %lor.lhs.false8
@@ -57,7 +57,7 @@ if.then13:
 br label %cleanup

 if.end14:
- %call15 = call i64 @func() #4
+ %call15 = call i64 @func()
 %cmp17 = icmp ne i64 %call15, %conv
 br i1 %cmp17, label %if.end25, label %lor.lhs.false19
@@ -77,5 +77,3 @@ cleanup:
 %retval.0 = phi i32 [ 0, %if.end25 ], [ 1, %if.then24 ], [ 1, %if.then13 ], [ 1, %if.then ]
 ret i32 %retval.0
 }
-
-attributes #0 = { minsize noinline nounwind optsize uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/GVNHoist/pr30499.ll b/llvm/test/Transforms/GVNHoist/pr30499.ll
index bd6f98c..6df9484b 100644
--- a/llvm/test/Transforms/GVNHoist/pr30499.ll
+++ b/llvm/test/Transforms/GVNHoist/pr30499.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -passes=gvn-hoist < %s

-define void @_Z3fn2v() #0 {
+define void @_Z3fn2v() {
 entry:
 %a = alloca ptr, align 8
 %b = alloca i32, align 4
@@ -11,7 +11,7 @@ entry:
 br i1 %tobool, label %if.then, label %if.end

 if.then: ; preds = %entry
- %call = call i64 @_Z3fn1v() #2
+ %call = call i64 @_Z3fn1v()
 %conv = trunc i64 %call to i32
 store i32 %conv, ptr %b, align 4
 br label %if.end
@@ -23,8 +23,4 @@ if.end: ; preds = %if.then, %entry
 }

 ; Function Attrs: nounwind readonly
-declare i64 @_Z3fn1v() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readonly }
+declare i64 @_Z3fn1v()
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
index 7cba30d..81a9323 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx"
 ; CHECK-LABEL: @test1
 ; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-define i32 @test1(ptr %a) #0 {
+define i32 @test1(ptr %a) {
 entry:
 br label %for.cond
@@ -25,5 +25,3 @@ for.body: ; preds = %for.cond
 for.end: ; preds = %for.cond
 ret i32 %sum.0
 }
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Inline/always-inline-attr.ll b/llvm/test/Transforms/Inline/always-inline-attr.ll
index 08ea307..4e69822 100644
--- a/llvm/test/Transforms/Inline/always-inline-attr.ll
+++ b/llvm/test/Transforms/Inline/always-inline-attr.ll
@@ -6,7 +6,7 @@
 target triple = "x86_64-grtev4-linux-gnu"

 ; After AlwaysInline the callee's attributes should be merged into caller's attributes.
-; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0) #0
+; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0)
 ; CHECK: attributes #0 = { mustprogress uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512"

 ; Function Attrs: uwtable mustprogress
@@ -36,12 +36,10 @@ entry:
 }

 ; Function Attrs: nounwind readnone
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) #2
-
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+avx512vl,+bmi2,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512f,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "prefer-vector-width"="128" }
+attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "prefer-vector-width"="128" }

 !2 = !{!3, !3, i64 0}
 !3 = !{!"omnipotent char", !4, i64 0}
diff --git a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
index a4c4134..bf0dfa2 100644
--- a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
+++ b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
@@ -89,11 +89,10 @@ entry:
 ret void, !dbg !20
 }

-declare void @_Z2f1v() #2
+attributes #0 = { uwtable }
+attributes #1 = { alwaysinline inlinehint uwtable }

-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline inlinehint uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z2f1v()

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Inline/inline-vla.ll b/llvm/test/Transforms/Inline/inline-vla.ll
index 8e4bb3d..1e3c235 100644
--- a/llvm/test/Transforms/Inline/inline-vla.ll
+++ b/llvm/test/Transforms/Inline/inline-vla.ll
@@ -9,7 +9,7 @@
 @.str1 = private unnamed_addr constant [3 x i8] c"ab\00", align 1

 ; Function Attrs: nounwind ssp uwtable
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) #0 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) {
 entry:
 %data = alloca [2 x i8], align 1
 call fastcc void @memcpy2(ptr %data, ptr @.str, i64 1)
@@ -18,7 +18,7 @@ entry:
 }

 ; Function Attrs: inlinehint nounwind ssp uwtable
-define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) #1 {
+define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) {
 entry:
 %vla = alloca i64, i64 %size, align 16
 call void @llvm.memcpy.p0.p0.i64(ptr %vla, ptr %src, i64 %size, i1 false)
@@ -27,11 +27,7 @@ entry:
 }

 ; Function Attrs: nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #2
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)

 !llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
index 3021935..2b4aedd 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
@@ -27,20 +27,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"

 ; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
 entry:
 ret i32 1, !dbg !9
 }

 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 {
+define i32 @bar() !dbg !10 {
 entry:
 %call = call i32 @foo(), !dbg !11
 ret i32 %call, !dbg !12
 }

-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
index d394713..f547c37 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
@@ -63,20 +63,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"

 ; Function Attrs: nounwind ssp uwtable
-define internal i32 @foo() #0 !dbg !7 {
+define internal i32 @foo() !dbg !7 {
 entry:
 ret i32 1, !dbg !9
 }

 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
 entry:
 %call = call i32 @foo(), !dbg !11
 ret i32 %call, !dbg !12
 }

-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index b0a238f..64b305b 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -72,20 +72,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"

 ; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
 entry:
 ret i32 1, !dbg !9
 }

 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
 entry:
 %call = call i32 @foo(), !dbg !11
 ret i32 %call, !dbg !12
 }

-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks.ll b/llvm/test/Transforms/Inline/optimization-remarks.ll
index bc1e690..135d835 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks.ll
@@ -81,9 +81,9 @@ entry:
 ret i32 %add
 }

-attributes #0 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { alwaysinline nounwind uwtable }
+attributes #1 = { noinline nounwind uwtable }
+attributes #2 = { nounwind uwtable }

 !llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
index ecedbdb..abe1ed0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
@@ -124,6 +124,39 @@ define <vscale x 8 x i1> @try_combine_svbool_binop_orr(<vscale x 8 x i1> %a, <vs
 ret <vscale x 8 x i1> %t3
 }

+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 8 x half> @try_combine_svbool_binop_fadd(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fadd(
+; CHECK-NEXT: [[T2:%.*]] = fadd <vscale x 8 x half> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 8 x half> [[T2]]
+;
+ %t1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %t1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 4 x float> @try_combine_svbool_binop_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fmul(
+; CHECK-NEXT: [[T2:%.*]] = fmul <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[T2]]
+;
+ %t1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %t1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 2 x double> @try_combine_svbool_binop_fsub(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fsub(
+; CHECK-NEXT: [[T2:%.*]] = fsub <vscale x 2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 2 x double> [[T2]]
+;
+ %t1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> splat (i1 true))
+ %t2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %t1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %t2
+}
+
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
index bb01299..3f29509 100644
--- a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
+++ b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -21,7 +21,7 @@
 @b = common global i32 0, align 4

 ; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
 entry:
 %b.promoted = load i32, ptr @b, align 4, !tbaa !2
 br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
 ret i32 undef
 }

-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
index 2348490..2d06f42 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@@ -208,6 +208,3 @@ define <4 x i32> @extract_cond_type_mismatch(<4 x i32> %x, <4 x i32> %y, <5 x i1
 %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
 ret <4 x i32> %r
 }
-
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement.ll b/llvm/test/Transforms/InstCombine/select-extractelement.ll
index 621d278..6f80b7d 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s

-declare void @v4float_user(<4 x float>) #0
+declare void @v4float_user(<4 x float>)

-define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_one_select(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -17,7 +17,7 @@ define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
 }

 ; Multiple extractelements
-define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_two_select(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -34,7 +34,7 @@ define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #
 }

 ; Select has an extra non-extractelement user, don't change it
-define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_one_select_user(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -49,7 +49,7 @@ define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0
 ret float %extract
 }

-define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_one_vselect_user(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -67,7 +67,7 @@ define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; Do not convert the vector select into a scalar select. That would increase
 ; the instruction count and potentially obfuscate a vector min/max idiom.

-define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_one_vselect(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -81,7 +81,7 @@ define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
 }

 ; Multiple extractelements from a vector select
-define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_two_vselect(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -100,7 +100,7 @@ define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32
 ; The vector selects are not decomposed into scalar selects because that would increase
 ; the instruction count. Extract+insert is converted to non-lane-crossing shuffles.
 ; Test multiple extractelements
-define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @simple_vector_select(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 0
@@ -232,5 +232,3 @@ define i32 @inf_loop_partial_undef(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x
 %t11 = extractelement <2 x i32> %p, i32 0
 ret i32 %t11
 }
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
index edaade32..26ba857 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
@@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @global.2 = external local_unnamed_addr global i64, align 8

 ; Function Attrs: norecurse noreturn nounwind uwtable
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
 ; CHECK-LABEL: define void @hoge(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
 ; CHECK-NEXT: [[BB:.*:]]
 ; CHECK-NEXT: br label %[[BB1:.*]]
 ; CHECK: [[BB1]]:
@@ -48,8 +48,6 @@ bb27: ; preds = %bb1
 br label %bb26
 }

-attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}

 !0 = !{!"clang version 7.0.0 "}
diff --git a/llvm/test/Transforms/LICM/volatile-alias.ll b/llvm/test/Transforms/LICM/volatile-alias.ll
index 410f3be..35e8844 100644
--- a/llvm/test/Transforms/LICM/volatile-alias.ll
+++ b/llvm/test/Transforms/LICM/volatile-alias.ll
@@ -10,7 +10,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

 ; Function Attrs: nounwind uwtable
-define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) #0 {
+define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) {
 entry:
 %p.addr = alloca ptr, align 8
 %q.addr = alloca ptr, align 8
@@ -51,5 +51,3 @@ for.end: ; preds = %for.cond
 %8 = load i32, ptr %s, align 4
 ret i32 %8
 }
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopRotate/noalias.ll b/llvm/test/Transforms/LoopRotate/noalias.ll
index f709bba..d6b2414 100644
--- a/llvm/test/Transforms/LoopRotate/noalias.ll
+++ b/llvm/test/Transforms/LoopRotate/noalias.ll
@@ -146,11 +146,7 @@ for.end: ; preds = %for.cond
 }

 ; Function Attrs: inaccessiblememonly nounwind
-declare void @llvm.experimental.noalias.scope.decl(metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inaccessiblememonly nounwind }
-attributes #2 = { nounwind readnone speculatable }
+declare void @llvm.experimental.noalias.scope.decl(metadata)

 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index 5423368..25ded4b 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -157,4 +157,4 @@ bb:
 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
 }

-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
index 6c34d1b..a878fc2 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll
@@ -62,6 +62,6 @@ for.end: ; preds = %fn3.exit
 ; Function Attrs: nounwind optsize
 declare i32 @printf(ptr nocapture readonly, ...) #1

-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind optsize }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
index 914ea7a..9c20685 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll
@@ -71,8 +71,8 @@ fn1.exit: ; preds = %lor.end.i
 ; Function Attrs: nounwind optsize
 declare i32 @printf(ptr nocapture readonly, ...) #1

-attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind optsize }

 !llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
index 3364465..37e2f68 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -46,7 +46,7 @@ for.body3: ; preds = %for.body3, %for.bod
 br i1 %exitcond, label %for.cond.loopexit, label %for.body3
 }

-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }

 !0 = !{!1, !2, i64 16}
 !1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48}
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
index ee28aa1..df4dfa6 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll
@@ -76,7 +76,7 @@ lee1.exit: ; preds = %lee1.exit.loopexit,
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }

 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 27ca414..199203a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -86,7 +86,8 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 41
+; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64))
+; CHECK: Cost for VF 16: 3
 ; CHECK: LV: Selecting VF: 16
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
index 2d15431..8109d068 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -6,8 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"

 define i32 @dotp(ptr %a, ptr %b) #0 {
-; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
-; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
 ;
 ; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
 ; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 3dfa6df..287226f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -31,9 +31,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
 ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
 ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
 ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
@@ -52,37 +52,34 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE: vector.body:
 ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <16 x i32> zeroinitializer, [[TMP11]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
 ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE: scalar.ph:
@@ -114,10 +111,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
 ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
 ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
 ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
 ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]]
 ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
 ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -184,9 +181,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
 ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
 ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
 ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -221,9 +218,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
 ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -262,10 +259,10 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
 ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
 ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
 ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
 ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
 ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64
[[INDEX_NEXT]], [[N_VEC]] @@ -331,11 +328,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -352,37 +349,34 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> 
-; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -414,11 +408,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -486,11 +480,11 @@ 
define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP16]] ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -508,37 +502,35 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw 
<vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP10]] +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -570,11 +562,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP18]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], [[TMP3]] @@ -644,12 +636,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP15]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP15]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -683,9 +675,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] @@ -726,12 +718,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]] ; 
CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP11]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -802,13 +794,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) -; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP13]] ; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -826,39 +818,37 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP21]] = sub <vscale x 4 x i32> [[TMP19]], [[TMP20]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP11]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -890,13 +880,13 @@ define i32 
@chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP19]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP20]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -969,9 +959,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1005,9 +995,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: 
[[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1045,9 +1035,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP15]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1110,8 +1100,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1142,8 +1132,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1178,8 +1168,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load 
<vscale x 8 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP11]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP12]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1240,10 +1230,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1276,10 +1266,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1316,10 +1306,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], 
align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP15]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll index b033f60..b430efc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll @@ -467,3 +467,83 @@ loop: exit: ret i32 %red.next } + +define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) { +; CHECK-LABEL: define i64 @partial_reduction_mul_two_users( +; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RES1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD_EXT_EXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RES2:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD]] = add i64 [[RES2]], [[MUL_EXT]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[C]] +; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32 +; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64 +; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = sext i16 %b to i32 + %mul = mul i32 %conv, %conv + %mul.ext = zext i32 %mul to i64 + %add = add i64 %res2, %mul.ext + %second_use = or i32 %mul, %c ; this value is otherwise unused, but that's sufficient for the test + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %exitcond740.not = icmp eq i64 %iv, %n + br i1 %exitcond740.not, label %exit, label %loop + +exit: + ret i64 %add +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 8ece59a..d8f1a86 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -16,10 +16,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = 
load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -68,7 +68,6 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -77,11 +76,12 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] @@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] @@ -112,7 +112,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: 
[[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 ; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: while.end.loopexit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void @@ -495,7 +495,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 09b41fb..26e630f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -26,26 +26,26 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x 
i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -71,25 +71,25 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 ; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> 
[[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] +; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] +; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]] -; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] +; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: @@ -137,26 +137,26 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32> +; CHECK-NEXT: 
[[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]]) +; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP14]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -182,25 +182,25 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 ; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to 
<vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] +; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] +; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]] -; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] +; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: @@ -242,18 +242,18 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: 
[[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -337,18 +337,18 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 801eb81..b847631 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -18,10 +18,10 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], 
align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP7]], [[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -80,10 +80,10 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -720,26 +720,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) @@ -788,59 +788,59 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> 
[[TMP29]]) ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP33]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP37]]) ; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP41]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP45]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -884,26 +884,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] ; 
CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) @@ -2025,7 +2025,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 @@ -2062,7 +2062,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -2091,7 +2091,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 1ace7d4..4636c1b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: 
[[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 @@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 @@ -75,26 +75,26 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]]) +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-MAXBW: scalar.ph: @@ -134,10 +134,10 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 1024 @@ -157,54 +157,78 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i64> [[TMP2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> 
[[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP12]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI2]], <16 x i64> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD11]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <16 x i64> [[TMP15]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI3]], <16 x i64> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX15:%.*]] = add <2 x i64> [[PARTIAL_REDUCE13]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX15]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX16]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]] ; ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod( ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x 
i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP15]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i64 [[TMP4]] ; entry: br label %for.body @@ -245,10 +269,10 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -269,30 +293,50 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> 
[[TMP7]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <8 x i16> [[WIDE_LOAD11]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <8 x i16> [[WIDE_LOAD7]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI2]], <8 x i64> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <8 x i16> [[WIDE_LOAD12]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <8 x i16> [[WIDE_LOAD8]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI3]], <8 x i64> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX17:%.*]] = add <2 x i64> [[PARTIAL_REDUCE15]], [[BIN_RDX16]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX17]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]] @@ -302,36 +346,28 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 
4 x i16>, ptr [[NEXT_GEP2]], align 2 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP15]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP17]] = add <vscale x 4 x i64> [[TMP16]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP17]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i64 [[TMP4]] ; entry: br label %for.body @@ -687,7 +723,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] @@ -835,7 +871,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 @@ -964,7 +1000,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: 
[[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1024,26 +1060,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x 
i32> [[TMP22]]) @@ -1092,58 +1128,58 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP15]]) ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]]) ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP23]]) +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP25]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP33]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> -; 
CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP38]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP39]]) +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP40]], [[TMP41]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1165,21 +1201,21 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1191,38 +1227,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 -; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP16]], align 1 -; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP73:%.*]] 
= mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP27]], [[TMP32]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI3]], <vscale x 16 x i32> [[TMP33]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD5]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 16 x i32> [[TMP18]], [[TMP19]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI2]], <vscale x 16 x i32> [[TMP20]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD8]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD9]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <vscale x 16 x i32> [[TMP21]], [[TMP22]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP23]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD11]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD12]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> 
[[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]]) +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -1390,7 +1426,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1525,7 +1561,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1565,110 +1601,93 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to 
<vscale x 2 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw nsw <16 x i64> [[TMP3]], [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: 
[[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; 
CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP8]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i64> +; 
CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP12]], [[TMP8]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -1971,7 +1990,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2104,7 +2123,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2160,10 +2179,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; 
CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2181,10 +2200,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-INTERLEAVED: for.body.preheader: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] @@ -2194,19 +2213,29 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i64> [[TMP13]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP16]]) +; 
CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i64> [[TMP10]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -2219,35 +2248,35 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW: for.body.preheader: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP12]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x 
i64> -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP14]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]]) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2319,17 +2348,16 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE19:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], 
[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2341,42 +2369,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125> ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126> ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) -; 
CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; 
CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -2429,7 +2458,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2441,42 +2469,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, 
i32 126> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; 
CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -2529,7 +2558,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2541,42 +2569,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125> ; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126> ; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call 
<4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) -; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) -; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) -; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll index b308b92..bd9fae6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll @@ -23,12 +23,12 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]] +; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]] -; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]]) +; IC2-NEXT: [[PARTIAL_REDUCE]] 
= call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; IC2-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]] +; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP6]]) ; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -80,18 +80,18 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; IC4-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]] +; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]]) ; IC4-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; IC4-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; IC4-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]] ; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]] -; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]] -; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]] -; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; IC4-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP10]], [[TMP10]] ; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]]) -; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) +; IC4-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; IC4-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP14]], [[TMP14]] +; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index 7bb4715..6dae09e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: 
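partial-reduce-sub.ll covers the negated form of the same idiom: the products are subtracted from the accumulator, which the vectorizer expresses as adding sub <16 x i32> zeroinitializer, %prod into the partial reduction. Both zexts again move directly before the multiply. A reduced sketch of one step, with illustrative names:

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @negating_dotp_step(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  %a.ext = zext <16 x i8> %a to <16 x i32>
  %b.ext = zext <16 x i8> %b to <16 x i32>
  %prod = mul <16 x i32> %b.ext, %a.ext
  ; acc -= a[i] * b[i], written as accumulating the negated products.
  %neg = sub <16 x i32> zeroinitializer, %prod
  %red = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %neg)
  ret <4 x i32> %red
}

The CHECK-MAXBW run for this test also changes shape: it now loads <vscale x 16 x i8> inputs into a <vscale x 4 x i32> accumulator instead of <vscale x 8 x i8> into <vscale x 2 x i32>.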
[[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -48,19 +48,19 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, 
[[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -78,27 +78,27 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br label [[ENTRY:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP6]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 70532ad..46ec858 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -45,8 +45,8 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -138,8 +138,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -227,8 +227,8 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -321,8 +321,8 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> 
@llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -421,12 +421,12 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -811,8 +811,8 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -1000,11 +1000,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; 
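add_of_loop_invariant_zext applies the placement change to a loop-invariant extend: the zext of the broadcast splat, previously hoisted into vector.ph, is now emitted inside vector.body next to its only user, presumably so that the value live across the back edge is the narrow <16 x i8> splat rather than its <16 x i32> extend. A reduced, self-contained sketch of the resulting shape (illustrative names, not the generated code):

declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)

define <4 x i32> @invariant_zext_in_body(<16 x i8> %splat, i32 %n) {
entry:
  br label %loop
loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  ; Invariant extend re-materialized next to its single user in the loop.
  %inv.ext = zext <16 x i8> %splat to <16 x i32>
  %acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %inv.ext)
  %iv.next = add nuw i32 %iv, 1
  %ec = icmp eq i32 %iv.next, %n
  br i1 %ec, label %exit, label %loop
exit:
  ret <4 x i32> %acc.next
}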
CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] @@ -1012,7 +1011,8 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -1031,23 +1031,23 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; 
CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -1071,11 +1071,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] @@ -1083,6 +1082,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -1164,12 +1164,12 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: 
[[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll index ebf4a4f..ab1486d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -37,7 +37,7 @@ for.end: ; preds = %for.body ret i32 %conv27 } -attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } !llvm.ident = !{!0} !0 = !{!"clang"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 25ee100..70685c1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 6 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 786a2aa..28d2a27 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1630,4 +1630,4 @@ for.body: ; preds = %for.body, %entry } attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } -attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } +attributes #0 = { "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll new file mode 100644 index 0000000..46b0ebd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll @@ -0,0 +1,276 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2IC1 %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF2IC2 %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx" + +define void @load_store_interleave_group_block_invar_cond(ptr noalias %data, ptr noalias %dst.0, ptr noalias %dst.1, i1 %c) { +; VF2IC1-LABEL: define void @load_store_interleave_group_block_invar_cond( +; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) { +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] +; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] +; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 +; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC1-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: br i1 [[C]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]] +; VF2IC1: [[PRED_STORE_IF2]]: +; VF2IC1-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]] +; VF2IC1: [[PRED_STORE_CONTINUE3]]: +; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]] +; VF2IC1-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP2]], align 1 +; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
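The new transform-narrow-interleave-to-widen-memory-multi-block.ll test pins down how the two-member i64 interleave group is lowered: one <4 x i64> wide load, two strided shuffles to split even and odd lanes, and a concat-plus-permute to re-interleave before the store. The round trip, extracted into a standalone function whose shuffle masks match the checks above:

define void @interleave_group_roundtrip(ptr %p) {
  %wide = load <4 x i64>, ptr %p, align 8
  ; De-interleave: member 0 takes even lanes, member 1 takes odd lanes.
  %m0 = shufflevector <4 x i64> %wide, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
  %m1 = shufflevector <4 x i64> %wide, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  ; Re-interleave: concatenate the members, then pick lanes 0,2,1,3.
  %cat = shufflevector <2 x i64> %m0, <2 x i64> %m1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ileaved = shufflevector <4 x i64> %cat, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %ileaved, ptr %p, align 8
  ret void
}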
!llvm.loop [[LOOP0:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: br label %[[EXIT:.*]] +; VF2IC1: [[EXIT]]: +; VF2IC1-NEXT: ret void +; +; VF2IC2-LABEL: define void @load_store_interleave_group_block_invar_cond( +; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1 +; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]] +; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2IC2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8 +; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; VF2IC2: [[PRED_STORE_IF6]]: +; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; VF2IC2: [[PRED_STORE_CONTINUE7]]: +; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; VF2IC2: [[PRED_STORE_IF8]]: +; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; VF2IC2: [[PRED_STORE_CONTINUE9]]: +; VF2IC2-NEXT: br i1 [[C]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]] +; VF2IC2: [[PRED_STORE_IF10]]: +; VF2IC2-NEXT: store i8 1, ptr [[DST_0]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; VF2IC2: [[PRED_STORE_CONTINUE11]]: +; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 2 +; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr 
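The store to %dst.0 sits in a conditional block, so it is scalarized into one predicated copy per unrolled lane: two PRED_STORE_IF blocks at VF2IC1 and four at VF2IC2, each gated by the same loop-invariant i1 %c. In the variable-condition function below, each guard instead tests one lane of the comparison mask via extractelement. A reduced sketch of the VF2IC1 control flow, with illustrative labels:

define void @predicated_store_per_lane(ptr %dst, i1 %c) {
entry:
  br i1 %c, label %if0, label %cont0
if0:
  ; Lane 0's copy of the conditional store.
  store i8 1, ptr %dst, align 1
  br label %cont0
cont0:
  br i1 %c, label %if1, label %cont1
if1:
  ; Lane 1's copy, gated by the same invariant condition.
  store i8 1, ptr %dst, align 1
  br label %cont1
cont1:
  ret void
}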
[[TMP5]], align 1 +; VF2IC2-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP6]], align 1 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2IC2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: br label %[[EXIT:.*]] +; VF2IC2: [[EXIT]]: +; VF2IC2-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %mul.2 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2 + %l.0 = load i64, ptr %data.0, align 8 + store i64 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.2, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1 + %l.1 = load i64, ptr %data.1, align 8 + store i64 %l.1, ptr %data.1, align 8 + br i1 %c, label %then, label %loop.latch + +then: + store i8 1, ptr %dst.0 + br label %loop.latch + +loop.latch: + %gep.dst.1 = getelementptr inbounds i8, ptr %dst.1, i64 %iv + store i8 0, ptr %gep.dst.1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @load_store_interleave_group_block_var_cond(ptr noalias %data, ptr %masks, ptr noalias %dst) { +; VF2IC1-LABEL: define void @load_store_interleave_group_block_var_cond( +; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) { +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] +; VF2IC1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] +; VF2IC1-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 +; VF2IC1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC1-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC1-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]] +; VF2IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 +; VF2IC1-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD1]], zeroinitializer +; VF2IC1-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP5]] +; VF2IC1-NEXT: store i8 1, ptr [[TMP6]], align 1 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]] +; VF2IC1: [[PRED_STORE_IF2]]: +; VF2IC1-NEXT: [[TMP8:%.*]] = add i64 
[[INDEX]], 1 +; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP8]] +; VF2IC1-NEXT: store i8 1, ptr [[TMP9]], align 1 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE3]] +; VF2IC1: [[PRED_STORE_CONTINUE3]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2IC1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: br label %[[EXIT:.*]] +; VF2IC1: [[EXIT]]: +; VF2IC1-NEXT: ret void +; +; VF2IC2-LABEL: define void @load_store_interleave_group_block_var_cond( +; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1 +; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]] +; VF2IC2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; VF2IC2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; VF2IC2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2IC2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2IC2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2IC2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2IC2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2IC2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8 +; VF2IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1 +; VF2IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP8]], align 1 +; VF2IC2-NEXT: [[TMP9:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], zeroinitializer +; VF2IC2-NEXT: [[TMP10:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], zeroinitializer +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP12]] +; VF2IC2-NEXT: store i8 1, ptr [[TMP13]], 
align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP15]] +; VF2IC2-NEXT: store i8 1, ptr [[TMP16]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP18]] +; VF2IC2-NEXT: store i8 1, ptr [[TMP19]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP21]] +; VF2IC2-NEXT: store i8 1, ptr [[TMP22]], align 1 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2IC2-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: br label %[[EXIT:.*]] +; VF2IC2: [[EXIT]]: +; VF2IC2-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %mul.2 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2 + %l.0 = load i64, ptr %data.0, align 8 + store i64 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.2, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1 + %l.1 = load i64, ptr %data.1, align 8 + store i64 %l.1, ptr %data.1, align 8 + %gep.mask = getelementptr inbounds i8, ptr %masks, i64 %iv + %l.mask = load i8, ptr %gep.mask + %c = icmp eq i8 %l.mask, 0 + br i1 %c, label %then, label %loop.latch + +then: + store i8 1, ptr %gep.mask + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index d4e5dea..49f663f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -23,18 +23,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4) ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, 
ir<1>, vp<[[VF]]> ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> ; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> ; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<[[ACC]]>, ir<%mul> +; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32)) ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -42,7 +39,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph @@ -89,10 +86,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4) ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]> ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]> ; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> ; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll index 0f398a6..2579918 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll @@ -327,5 +327,5 @@ for.end: ; preds = %for.body, %entry declare float @fabsf(float) -attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" 
"use-soft-float"="false" } +attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index 0a9b1e0..61e3a18 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -60,10 +60,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]] ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -125,17 +125,17 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] -; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]] ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 1024 @@ -222,10 +222,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]] ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -287,17 +287,17 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] -; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]] ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -384,10 +384,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: 
[[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]] ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -449,18 +449,18 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -545,10 +545,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1 ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> 
[[WIDE_LOAD]] to <vscale x 4 x i32> ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]] ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]]) ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -610,18 +610,18 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 ; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 -; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> -; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) -; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll new file mode 100644 index 0000000..2338da5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll @@ -0,0 +1,126 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s -S | FileCheck %s +; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s -S | FileCheck %s --check-prefix=CHECK-MAX-BANDWIDTH + +target triple = "wasm32" + +define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: define hidden i32 @accumulate_add_u8_u8( +; CHECK-SAME: ptr noundef readonly [[A:%.*]], ptr noundef 
readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]] +; CHECK-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]] +; CHECK-NEXT: [[INC]] = add nuw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; CHECK-MAX-BANDWIDTH-LABEL: define hidden i32 @accumulate_add_u8_u8( +; CHECK-MAX-BANDWIDTH-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAX-BANDWIDTH-NEXT: [[ENTRY:.*]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; 
CHECK-MAX-BANDWIDTH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAX-BANDWIDTH: [[VECTOR_PH]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 +; CHECK-MAX-BANDWIDTH-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-MAX-BANDWIDTH-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAX-BANDWIDTH: [[VECTOR_BODY]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAX-BANDWIDTH-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]] +; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]] +; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) +; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAX-BANDWIDTH: [[MIDDLE_BLOCK]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]]) +; CHECK-MAX-BANDWIDTH-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; CHECK-MAX-BANDWIDTH: [[SCALAR_PH]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAX-BANDWIDTH-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAX-BANDWIDTH-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAX-BANDWIDTH: [[FOR_COND_CLEANUP]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAX-BANDWIDTH-NEXT: ret i32 [[RESULT_0_LCSSA]] +; CHECK-MAX-BANDWIDTH: [[FOR_BODY]]: +; CHECK-MAX-BANDWIDTH-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-MAX-BANDWIDTH-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]] +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-MAX-BANDWIDTH-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]] +; CHECK-MAX-BANDWIDTH-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-MAX-BANDWIDTH-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-MAX-BANDWIDTH-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]] +; CHECK-MAX-BANDWIDTH-NEXT: [[ADD3]] = 
add i32 [[ADD]], [[CONV2]] +; CHECK-MAX-BANDWIDTH-NEXT: [[INC]] = add nuw i32 [[IV]], 1 +; CHECK-MAX-BANDWIDTH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add3 + +for.body: ; preds = %entry, %for.body + %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %red = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %iv + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add = add i32 %red, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %iv, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll index 9168ebf..08d39ea 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -214,7 +214,7 @@ for.end15: ; preds = %for.end.us, %entry ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" } !3 = !{!4, !5} !4 = !{!4} diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll index 44e9d3e..b7f8be1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll @@ -71,6 +71,6 @@ declare i32 @printf(ptr, ...) 
#1 ; Function Attrs: nounwind declare i32 @puts(ptr nocapture readonly) #2 -attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index f066000..52e90e4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -362,4 +362,4 @@ for.body: br i1 %exitcond, label %for.cond.cleanup, label %for.body } -attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="false" } +attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index 005696a..e405fe7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -335,4 +335,4 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99 } -attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" 
"less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll index b98a2ea..c7550ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll @@ -143,7 +143,7 @@ for.inc: br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11 } -attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll index 2c187ca..9471e4a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll @@ -72,7 +72,7 @@ for.end: ; preds = %for.body, %entry ret void, !dbg !27 } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll 
b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll index 8ce2a97..0656de4 100644 --- a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll +++ b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll @@ -50,7 +50,7 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18 } -attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll index 65bd36c..5e51624f 100644 --- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll +++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll @@ -134,7 +134,7 @@ for.cond.cleanup: ret void, !dbg !44 } -attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5} diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll index 4b7b714..77ec95a 100644 --- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll +++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll @@ -145,7 +145,7 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55 } -attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5} diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll index 5cf99b8..3487331 100644 --- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll +++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll @@ -135,7 +135,7 @@ thread-pre-split5: ; preds = %.lr.ph ret void } -attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 8efe29a..fc2e233 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -783,7 +783,7 @@ for.body: ; preds = %for.body, %entry @SA = common global i32 0, align 4 @SB = common global float 0.000000e+00, align 4 -define void @int_float_struct(ptr nocapture readonly %A) #0 { +define void @int_float_struct(ptr nocapture readonly %A) { ; CHECK-LABEL: @int_float_struct( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] @@ -1546,5 +1546,3 @@ loop: end: ret void } - -attributes #0 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll index ddf9029..22243d9 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll @@ -89,7 +89,7 @@ for.end: ; preds = %for.body, %entry ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" } !0 = !{!0, !1, !5} !1 = !{!"llvm.loop.vectorize.width", i32 8} diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll index a1fc1b8..87b5a0b 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll @@ -56,4 +56,4 @@ for.end: ; preds = %for.body ret i32 0 } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll index 705bbab..64ae730 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll +++ 
b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll @@ -218,4 +218,4 @@ for.end: ; preds = %for.body, %entry ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll index 1effb10..872a3929 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll @@ -129,4 +129,4 @@ for.end: ; preds = %for.body ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll index 8224d6b..137e098 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll @@ -104,8 +104,8 @@ for.end26: ; preds = %for.cond4.for.end26 } declare i32 @fn2(double) #1 -attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" } !0 = !{!"int", !1} !1 = !{!"omnipotent char", !2} diff --git a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll index e2f5007..cc187de 100644 --- a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll +++ b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll @@ -35,11 +35,7 @@ if.end5: ; preds = %if.then, %entry ret i1 %rez.0 } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" 
"use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) diff --git a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll index d4fd34a..d8065e8 100644 --- a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll +++ b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll @@ -7,13 +7,13 @@ target triple = "i386-pc-windows-msvc19.11.0" %class.a = type { i32, i32, i32, i32, i32 } ; Function Attrs: nounwind optsize -define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr #0 { +define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr { ; CHECK-LABEL: @pr41917( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() ; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label %"land.end+land.rhs3" ; CHECK: land.rhs: -; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR3]] +; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() ; CHECK-NEXT: br label %"land.end+land.rhs3" ; CHECK: "land.end+land.rhs3": ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[G:%.*]], i32 0, i32 1 @@ -25,11 +25,11 @@ define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly alig ; CHECK-NEXT: ret i1 [[TMP2]] ; entry: - %call = tail call zeroext i1 @f2() #2 + %call = tail call zeroext i1 @f2() br i1 %call, label %land.rhs, label %land.end land.rhs: ; preds = %entry - %call1 = tail call zeroext i1 @f2() #2 + %call1 = tail call zeroext i1 @f2() br label %land.end land.end: ; preds = %land.rhs, %entry @@ -53,11 +53,7 @@ land.end6: ; preds = %land.rhs3, %land.en ret i1 %4 } -declare dso_local zeroext i1 @f2() local_unnamed_addr #1 - -attributes #0 = { nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind optsize } +declare dso_local zeroext i1 @f2() local_unnamed_addr !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll index 1cf9fd9..12b4184 100644 --- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll +++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ;; Function Attrs: nounwind ssp uwtable ;; We should eliminate the sub, and one of the phi nodes -define void @vnum_test1(ptr %data) #0 { +define void @vnum_test1(ptr %data) { ; CHECK-LABEL: @vnum_test1( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = 
getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3 @@ -74,7 +74,7 @@ bb19: ; preds = %bb4 ;; We should eliminate the sub, one of the phi nodes, prove the store of the sub ;; and the load of data are equivalent, that the load always produces constant 0, and ;; delete the load replacing it with constant 0. -define i32 @vnum_test2(ptr %data) #0 { +define i32 @vnum_test2(ptr %data) { ; CHECK-LABEL: @vnum_test2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3 @@ -142,11 +142,10 @@ bb21: ; preds = %bb4 ret i32 %p.0 } - ; Function Attrs: nounwind ssp uwtable ;; Same as test 2, with a conditional store of m-n, so it has to also discover ;; that data ends up with the same value no matter what branch is taken. -define i32 @vnum_test3(ptr %data) #0 { +define i32 @vnum_test3(ptr %data) { ; CHECK-LABEL: @vnum_test3( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3 @@ -305,7 +304,6 @@ bb3: ; preds = %bb2 %tmp3 = sub i32 %tmp, %phi2 ret i32 %tmp3 } -attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0, !0, !0} diff --git a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll index 8b2d662..b6da86b 100644 --- a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll +++ b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 %"union.llvm::SmallVectorBase::U" = type { x86_fp80 } ; Function Attrs: ssp uwtable -define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 { +define void @_Z4testv() personality ptr @__gxx_personality_v0 { ; CHECK: @_Z4testv() ; CHECK: invoke.cont: ; CHECK: br i1 true, label %new.notnull.i11, label %if.end.i14 @@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 { entry: %sv = alloca %"class.llvm::SmallVector", align 16 - call void @llvm.lifetime.start.p0(ptr %sv) #1 + call void @llvm.lifetime.start.p0(ptr %sv) %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3 store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4 %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1 @@ -83,11 +83,11 @@ invoke.cont3: ; preds = %invoke.cont2 br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20 if.then.i.i.i20: ; preds = %invoke.cont3 - call void @free(ptr %5) #1 + call void @free(ptr %5) br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20 - call void @llvm.lifetime.end.p0(ptr %sv) #1 + call void @llvm.lifetime.end.p0(ptr %sv) ret void lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2 @@ -98,7 +98,7 @@ lpad: ; preds = %if.end.i14, %if.end br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i if.then.i.i.i: ; preds = %lpad - call void @free(ptr %7) #1 + call void @free(ptr %7) br label %eh.resume eh.resume: ; preds = %if.then.i.i.i, %lpad @@ -106,24 +106,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa } ; Function Attrs: nounwind -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) declare i32 @__gxx_personality_v0(...) 
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2 +declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) ; Function Attrs: nounwind -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 +declare void @llvm.lifetime.end.p0(ptr nocapture) -declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2 +declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) ; Function Attrs: nounwind -declare void @free(ptr nocapture) #3 - -attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +declare void @free(ptr nocapture) !0 = !{!"any pointer", !1} !1 = !{!"omnipotent char", !2} diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll index ba4fc14..388b941 100644 --- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll +++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ;; one set of indexing calculations and a load ; Function Attrs: nounwind ssp uwtable -define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 { +define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-LABEL: @bar( ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB3:%.*]] @@ -59,8 +59,6 @@ bb20: ; preds = %bb17 ret i32 %tmp14 } -attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll index f83d145..71e9041 100644 --- a/llvm/test/Transforms/NewGVN/memory-handling.ll +++ b/llvm/test/Transforms/NewGVN/memory-handling.ll @@ -282,10 +282,10 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #2 ; Function Attrs: inlinehint nounwind readonly uwtable declare i32 @tolower(i32) local_unnamed_addr #3 -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind readnone } attributes #2 = { argmemonly nounwind } -attributes #3 = { inlinehint nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { inlinehint nounwind readonly uwtable } attributes #4 = { nounwind readnone } attributes #5 = { nounwind readonly } diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll index 82e9a2a..c1fb836 100644 --- a/llvm/test/Transforms/NewGVN/pr31483.ll +++ b/llvm/test/Transforms/NewGVN/pr31483.ll @@ -6,7 +6,7 @@ target datalayout = "E-m:e-i64:64-n32:64" ;; Ensure we do not believe the indexing increments are unreachable due to incorrect memory ;; equivalence detection. In PR31483, we were deleting those blocks as unreachable ; Function Attrs: nounwind -define signext i32 @ham(ptr %arg, ptr %arg1) #0 { +define signext i32 @ham(ptr %arg, ptr %arg1) { ; CHECK-LABEL: @ham( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 @@ -89,12 +89,7 @@ bb23: ; preds = %bb2 ret i32 undef } -declare signext i32 @zot(ptr, ...) #1 +declare signext i32 @zot(ptr, ...) ; Function Attrs: nounwind -declare void @llvm.va_end(ptr) #2 - -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - +declare void @llvm.va_end(ptr) diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll index 353c693..b2ba42b 100644 --- a/llvm/test/Transforms/NewGVN/pr31501.ll +++ b/llvm/test/Transforms/NewGVN/pr31501.ll @@ -49,9 +49,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" %struct.wombat.28 = type <{ ptr, i8, i8, [6 x i8] }> ; Function Attrs: norecurse nounwind ssp uwtable -define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 { +define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr align 2 { ; CHECK-LABEL: define weak_odr hidden ptr @quux( -; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr align 2 { ; CHECK-NEXT: [[BB:.*]]: ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]] @@ -112,8 +112,6 @@ bb21: ; preds = %bb19, %bb ret ptr %tmp22 } -attributes #0 = { norecurse nounwind ssp uwtable 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll index 969f172..61eb7a5 100644 --- a/llvm/test/Transforms/NewGVN/pr33187.ll +++ b/llvm/test/Transforms/NewGVN/pr33187.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ;; Ensure we don't change after value numbering by accidentally deleting the wrong expression. ; RUN: opt -passes=newgvn -S %s | FileCheck %s -define void @fn1(i1 %arg) local_unnamed_addr #0 { +define void @fn1(i1 %arg) local_unnamed_addr { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_COND_PREHEADER:%.*]] @@ -89,8 +89,7 @@ if.end18: ; preds = %L, %while.body12 br label %while.cond10 } - -define void @hoge() local_unnamed_addr #0 { +define void @hoge() local_unnamed_addr { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] @@ -108,9 +107,6 @@ bb1: ; preds = %bb1, %bb br label %bb1 } -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - - define void @a(i1 %arg) { ; CHECK-LABEL: @a( ; CHECK-NEXT: b: @@ -143,4 +139,3 @@ f: ; preds = %d %e = getelementptr i8, ptr %i, i64 1 br label %d } - diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll index e742f14..ff645f8 100644 --- a/llvm/test/Transforms/NewGVN/pr33305.ll +++ b/llvm/test/Transforms/NewGVN/pr33305.ll @@ -16,9 +16,9 @@ target triple = "x86_64-apple-macosx10.12.0" @str.2 = private unnamed_addr constant [8 x i8] c"Screwed\00" ; Function Attrs: nounwind optsize ssp uwtable -define i32 @main() local_unnamed_addr #0 { +define i32 @main() local_unnamed_addr { ; CHECK-LABEL: define i32 @main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[INT_TBAA3:![0-9]+]] ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1 @@ -77,7 +77,7 @@ define i32 @main() local_unnamed_addr #0 { ; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] ; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2) -; CHECK-NEXT: tail call void @abort() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: tail call void @abort() ; CHECK-NEXT: unreachable ; CHECK: [[IF_END]]: ; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(ptr @str) @@ -153,7 +153,7 @@ fn1.exit: ; preds = %if.then.i, %for.end if.then: ; preds = %fn1.exit %puts2 = tail call i32 @puts(ptr @str.2) - tail call void @abort() #3 + tail call void @abort() unreachable if.end: ; preds = %fn1.exit @@ -162,15 +162,10 @@ if.end: ; preds = %fn1.exit } ; Function Attrs: noreturn 
nounwind optsize -declare void @abort() local_unnamed_addr #1 +declare void @abort() local_unnamed_addr ; Function Attrs: nounwind -declare i32 @puts(ptr nocapture readonly) local_unnamed_addr #2 - -attributes #0 = { nounwind optsize ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noreturn nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } -attributes #3 = { noreturn nounwind optsize } +declare i32 @puts(ptr nocapture readonly) local_unnamed_addr !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/NewGVN/pr34430.ll b/llvm/test/Transforms/NewGVN/pr34430.ll index 490ba433..62d5770 100644 --- a/llvm/test/Transforms/NewGVN/pr34430.ll +++ b/llvm/test/Transforms/NewGVN/pr34430.ll @@ -4,7 +4,7 @@ source_filename = "bugpoint-output-e4c7d0f.bc" ; Make sure we still properly resolve phi cycles when they involve predicateinfo copies of phis. -define void @hoge(i1 %arg) local_unnamed_addr #0 { +define void @hoge(i1 %arg) local_unnamed_addr { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: ; CHECK-NEXT: br i1 %arg, label [[BB6:%.*]], label [[BB1:%.*]] @@ -41,8 +41,6 @@ bb6: ; preds = %bb4, %bb2, %bb1, %b br label %bb4 } -attributes #0 = { norecurse noreturn nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 6.0.0"} diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll index 48bdd88..7c38147 100644 --- a/llvm/test/Transforms/NewGVN/pr34452.ll +++ b/llvm/test/Transforms/NewGVN/pr34452.ll @@ -6,9 +6,9 @@ source_filename = "bugpoint-output-09f7a24.bc" @WHOLELINE = external local_unnamed_addr global i32, align 4 ; Function Attrs: nounwind uwtable -define void @sgrep() local_unnamed_addr #0 { +define void @sgrep() local_unnamed_addr { ; CHECK-LABEL: define void @sgrep( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[INT_TBAA1:![0-9]+]] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0 @@ -36,10 +36,7 @@ while.body.us: ; preds = %while.body.us, %ent } ; Function Attrs: nounwind readnone speculatable -declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1 - -attributes #0 = { nounwind uwtable 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone speculatable } +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) !llvm.ident = !{!0} diff --git a/llvm/test/Transforms/OpenMP/dead_use.ll b/llvm/test/Transforms/OpenMP/dead_use.ll index 1c4b2c6..ad0f91c 100644 --- a/llvm/test/Transforms/OpenMP/dead_use.ll +++ b/llvm/test/Transforms/OpenMP/dead_use.ll @@ -6,9 +6,9 @@ @0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8 ; Function Attrs: nounwind uwtable -define dso_local i32 @b() #0 { +define dso_local i32 @b() { ; CHECK-LABEL: define dso_local i32 @b( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -21,9 +21,9 @@ define dso_local i32 @b() #0 { } ; Function Attrs: nounwind uwtable -define internal i32 @a() #0 { +define internal i32 @a() { ; CHECK-LABEL: define internal i32 @a( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @b() ; CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB0:[0-9]+]], i32 0, ptr @.omp_outlined.) @@ -38,9 +38,9 @@ define internal i32 @a() #0 { } ; Function Attrs: norecurse nounwind uwtable -define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 { +define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) { ; CHECK-LABEL: define internal void @.omp_outlined.( -; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]] @@ -55,11 +55,7 @@ define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 { } ; Function Attrs: nounwind -declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) 
#2 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/OpenMP/icv_remarks.ll b/llvm/test/Transforms/OpenMP/icv_remarks.ll index f76d487..3caa56f 100644 --- a/llvm/test/Transforms/OpenMP/icv_remarks.ll +++ b/llvm/test/Transforms/OpenMP/icv_remarks.ll @@ -14,55 +14,48 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV active_levels Value: 0 ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV cancel Value: 0 -define dso_local void @foo(i32 %a) local_unnamed_addr #0 !dbg !17 { +define dso_local void @foo(i32 %a) local_unnamed_addr !dbg !17 { entry: %.kmpc_loc.addr = alloca %struct.ident_t, align 8 call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(24) %.kmpc_loc.addr, ptr nonnull align 8 dereferenceable(24) @0, i64 16, i1 false) call void @llvm.dbg.value(metadata i32 %a, metadata !19, metadata !DIExpression()), !dbg !21 - tail call void @omp_set_num_threads(i32 %a) #1, !dbg !22 - %call = tail call i32 @omp_get_max_threads() #1, !dbg !23 + tail call void @omp_set_num_threads(i32 %a), !dbg !22 + %call = tail call i32 @omp_get_max_threads(), !dbg !23 call void @llvm.dbg.value(metadata i32 %call, metadata !20, metadata !DIExpression()), !dbg !21 - tail call void @use(i32 %call) #1, !dbg !24 + tail call void @use(i32 %call), !dbg !24 %0 = getelementptr inbounds %struct.ident_t, ptr %.kmpc_loc.addr, i64 0, i32 4, !dbg !25 store ptr @1, ptr %0, align 8, !dbg !25, !tbaa !26 - call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.) #1, !dbg !25 + call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.), !dbg !25 ret void, !dbg !32 } -declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr #1 +declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr -declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr #1 +declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr -declare !dbg !12 dso_local void @use(i32) local_unnamed_addr #2 +declare !dbg !12 dso_local void @use(i32) local_unnamed_addr ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV active_levels Value: 0 ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV cancel Value: 0 -define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) #3 !dbg !33 { +define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) !dbg !33 { entry: call void @llvm.dbg.value(metadata ptr %.global_tid., metadata !41, metadata !DIExpression()), !dbg !43 call void @llvm.dbg.value(metadata ptr %.bound_tid., metadata !42, metadata !DIExpression()), !dbg !43 - call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()) #1, !dbg !50 - call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()) #1, !dbg !50 - tail call void @omp_set_num_threads(i32 10) #1, !dbg !52 - %call.i = tail call i32 @omp_get_max_threads() #1, !dbg !53 - call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()) #1, !dbg !54 - tail call void @use(i32 %call.i) #1, !dbg !55 + call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()), !dbg !50 + call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()), !dbg !50 + tail call void @omp_set_num_threads(i32 10), !dbg !52 + %call.i = tail call i32 @omp_get_max_threads(), !dbg !53 + call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()), !dbg !54 + tail call void @use(i32 %call.i), !dbg !55 ret void, !dbg !56 } -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4 +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) -declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr #1 +declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) 
local_unnamed_addr -declare void @llvm.dbg.value(metadata, metadata, metadata) #5 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { argmemonly nounwind willreturn } -attributes #5 = { nounwind readnone speculatable willreturn } +declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!13, !14, !15, !59} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll index 0a1efda..eb1cc5e 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll @@ -9,7 +9,6 @@ ; CHECK-NOT: remark: {{.*}} ; CHECK: !{!"branch_weights", i32 0, i32 200000} - ; ModuleID = 'misexpect-branch-correct.c' source_filename = "misexpect-branch-correct.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -19,14 +18,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) @buzz() store i32 %call, ptr %rando, align 4, !tbaa !3 - call void @llvm.lifetime.start.p0(ptr %x) #4 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !3 %0 = load i32, ptr %rando, align 4, !tbaa !3 %rem = srem i32 %0, 200000 @@ -52,31 +51,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !3 - call void @llvm.lifetime.end.p0(ptr %x) #4 - call void @llvm.lifetime.end.p0(ptr %rando) #4 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) 
; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll index 68b233e..3390825 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll @@ -16,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 !dbg !6 { +define i32 @bar() !dbg !6 { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9 + call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9 %call = call i32 (...) @buzz(), !dbg !9 store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10 - call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14 + call void @llvm.lifetime.start.p0(ptr %x), !dbg !14 store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10 %0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10 %rem = srem i32 %0, 200000, !dbg !15 @@ -49,31 +49,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10 - call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20 - call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20 + call void @llvm.lifetime.end.p0(ptr %x), !dbg !20 + call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20 ret i32 %2, !dbg !19 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) 
; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll index 2f188f5..d34708c 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll @@ -19,7 +19,6 @@ ; DISABLED-NOT: warning: <unknown>:0:0: 19.98% ; DISABLED-NOT: remark: <unknown>:0:0: Potential performance regression from use of the llvm.expect intrinsic: Annotation was correct on 19.98% (399668 / 2000000) of profiled executions. - ; ModuleID = 'misexpect-branch.c' source_filename = "misexpect-branch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -29,14 +28,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) @buzz() store i32 %call, ptr %rando, align 4, !tbaa !3 - call void @llvm.lifetime.start.p0(ptr %x) #4 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !3 %0 = load i32, ptr %rando, align 4, !tbaa !3 %rem = srem i32 %0, 200000 @@ -62,31 +61,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !3 - call void @llvm.lifetime.end.p0(ptr %x) #4 - call void @llvm.lifetime.end.p0(ptr %rando) #4 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) 
; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll index 4add781..a43e632 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll @@ -7,7 +7,6 @@ ; CHECK-NOT: warning: {{.*}} ; CHECK-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-branch-unpredictable.c' source_filename = "clang/test/Profile/misexpect-branch-unpredictable.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -17,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #3 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) @buzz() store i32 %call, ptr %rando, align 4, !tbaa !2 - call void @llvm.lifetime.start.p0(ptr %x) #3 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !2 %0 = load i32, ptr %rando, align 4, !tbaa !2 %rem = srem i32 %0, 200000 @@ -49,27 +48,22 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !2 - call void @llvm.lifetime.end.p0(ptr %x) #3 - call void @llvm.lifetime.end.p0(ptr %rando) #3 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) 
-declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll index 5a7731b..7e01f7a 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll @@ -33,14 +33,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 !dbg !6 { +define i32 @bar() !dbg !6 { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9 + call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9 %call = call i32 (...) @buzz(), !dbg !9 store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10 - call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14 + call void @llvm.lifetime.start.p0(ptr %x), !dbg !14 store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10 %0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10 %rem = srem i32 %0, 200000, !dbg !15 @@ -66,31 +66,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10 - call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20 - call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20 + call void @llvm.lifetime.end.p0(ptr %x), !dbg !20 + call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20 ret i32 %2, !dbg !19 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) 
; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll index 859ba72..38c27b9 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll @@ -25,7 +25,6 @@ ; CORRECT-NOT: warning: {{.*}} ; CORRECT-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-switch.c' source_filename = "misexpect-switch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -37,10 +36,10 @@ target triple = "x86_64-unknown-linux-gnu" @arry = dso_local global [25 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define dso_local void @init_arry() #0 { +define dso_local void @init_arry() { entry: %i = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %i) #6 + call void @llvm.lifetime.start.p0(ptr %i) store i32 0, ptr %i, align 4, !tbaa !4 br label %for.cond @@ -50,7 +49,7 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - %call = call i32 @rand() #6 + %call = call i32 @rand() %rem = srem i32 %call, 10 %1 = load i32, ptr %i, align 4, !tbaa !4 %idxprom = sext i32 %1 to i64 @@ -65,24 +64,24 @@ for.inc: ; preds = %for.body br label %for.cond for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %i) #6 + call void @llvm.lifetime.end.p0(ptr %i) ret void } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) ; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind -declare dso_local i32 @rand() #3 +declare dso_local i32 @rand() ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 +declare void @llvm.lifetime.end.p0(ptr nocapture) ; Function Attrs: nounwind uwtable -define dso_local i32 @main() #0 { +define dso_local i32 @main() { entry: %retval = alloca i32, align 
4 %val = alloca i32, align 4 @@ -90,9 +89,9 @@ entry: %condition = alloca i32, align 4 store i32 0, ptr %retval, align 4 call void @init_arry() - call void @llvm.lifetime.start.p0(ptr %val) #6 + call void @llvm.lifetime.start.p0(ptr %val) store i32 0, ptr %val, align 4, !tbaa !4 - call void @llvm.lifetime.start.p0(ptr %j) #6 + call void @llvm.lifetime.start.p0(ptr %j) store i32 0, ptr %j, align 4, !tbaa !4 br label %for.cond @@ -102,8 +101,8 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - call void @llvm.lifetime.start.p0(ptr %condition) #6 - %call = call i32 @rand() #6 + call void @llvm.lifetime.start.p0(ptr %condition) + %call = call i32 @rand() %rem = srem i32 %call, 5 store i32 %rem, ptr %condition, align 4, !tbaa !4 %1 = load i32, ptr %condition, align 4, !tbaa !4 @@ -138,7 +137,7 @@ sw.default: ; preds = %for.body unreachable sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb - call void @llvm.lifetime.end.p0(ptr %condition) #6 + call void @llvm.lifetime.end.p0(ptr %condition) br label %for.inc for.inc: ; preds = %sw.epilog @@ -148,25 +147,17 @@ for.inc: ; preds = %sw.epilog br label %for.cond for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %j) #6 - call void @llvm.lifetime.end.p0(ptr %val) #6 + call void @llvm.lifetime.end.p0(ptr %j) + call void @llvm.lifetime.end.p0(ptr %val) ret i32 0 } ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #4 - -declare dso_local i32 @sum(ptr, i32) #5 +declare i64 @llvm.expect.i64(i64, i64) -declare dso_local i32 @random_sample(ptr, i32) #5 +declare dso_local i32 @sum(ptr, i32) -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { nounwind readnone speculatable willreturn } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind readnone willreturn } -attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #6 = { nounwind } +declare dso_local i32 @random_sample(ptr, i32) !llvm.module.flags = !{!0, !1, !2} !llvm.ident = !{!3} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll index 242d5b8..9665559 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll +++ 
b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll @@ -28,7 +28,6 @@ ; CORRECT-NOT: warning: {{.*}} ; CORRECT-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-switch.c' source_filename = "misexpect-switch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -40,10 +39,10 @@ target triple = "x86_64-unknown-linux-gnu" @arry = dso_local global [25 x i32] zeroinitializer, align 16, !dbg !12 ; Function Attrs: nounwind uwtable -define dso_local void @init_arry() #0 !dbg !21 { +define dso_local void @init_arry() !dbg !21 { entry: %i = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %i) #6, !dbg !26 + call void @llvm.lifetime.start.p0(ptr %i), !dbg !26 call void @llvm.dbg.declare(metadata ptr %i, metadata !25, metadata !DIExpression()), !dbg !27 store i32 0, ptr %i, align 4, !dbg !28, !tbaa !30 br label %for.cond, !dbg !34 @@ -54,7 +53,7 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end, !dbg !38 for.body: ; preds = %for.cond - %call = call i32 @rand() #6, !dbg !39 + %call = call i32 @rand(), !dbg !39 %rem = srem i32 %call, 10, !dbg !41 %1 = load i32, ptr %i, align 4, !dbg !42, !tbaa !30 %idxprom = sext i32 %1 to i64, !dbg !43 @@ -69,24 +68,24 @@ for.inc: ; preds = %for.body br label %for.cond, !dbg !47, !llvm.loop !48 for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %i) #6, !dbg !50 + call void @llvm.lifetime.end.p0(ptr %i), !dbg !50 ret void, !dbg !50 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) ; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind -declare dso_local i32 @rand() #3 +declare dso_local i32 @rand() ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 +declare void @llvm.lifetime.end.p0(ptr nocapture) ; Function Attrs: nounwind uwtable -define dso_local i32 @main() #0 !dbg !51 { +define dso_local i32 @main() !dbg !51 { entry: %retval = alloca i32, align 4 %val = alloca i32, align 4 @@ -94,10 +93,10 @@ entry: %condition = alloca i32, align 4 store i32 0, ptr %retval, align 4 call void @init_arry(), !dbg !62 - call void @llvm.lifetime.start.p0(ptr %val) #6, !dbg !63 + call void @llvm.lifetime.start.p0(ptr %val), !dbg !63 call void @llvm.dbg.declare(metadata ptr %val, metadata !55, metadata !DIExpression()), !dbg !64 store i32 0, ptr %val, align 4, !dbg !64, !tbaa !30 - call void @llvm.lifetime.start.p0(ptr %j) #6, !dbg !65 + call void @llvm.lifetime.start.p0(ptr %j), !dbg !65 call void @llvm.dbg.declare(metadata ptr %j, metadata !56, metadata !DIExpression()), !dbg !66 store i32 0, ptr %j, align 4, !dbg !67, !tbaa !30 br label %for.cond, !dbg !68 @@ -108,9 +107,9 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end, !dbg !71 for.body: ; preds = %for.cond - call void @llvm.lifetime.start.p0(ptr %condition) #6, !dbg !72 + call void @llvm.lifetime.start.p0(ptr %condition), !dbg !72 call void @llvm.dbg.declare(metadata ptr %condition, metadata !57, metadata !DIExpression()), !dbg !73 - %call = call i32 @rand() #6, !dbg !74 + %call = call i32 @rand(), !dbg !74 %rem = srem i32 %call, 5, !dbg !75 store i32 %rem, ptr %condition, align 4, !dbg !73, !tbaa !30 %1 = load i32, ptr %condition, align 4, !dbg !76, !tbaa !30 @@ -145,7 +144,7 @@ sw.default: ; preds = %for.body 
unreachable, !dbg !87 sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb - call void @llvm.lifetime.end.p0(ptr %condition) #6, !dbg !88 + call void @llvm.lifetime.end.p0(ptr %condition), !dbg !88 br label %for.inc, !dbg !89 for.inc: ; preds = %sw.epilog @@ -155,25 +154,17 @@ for.inc: ; preds = %sw.epilog br label %for.cond, !dbg !91, !llvm.loop !92 for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %j) #6, !dbg !94 - call void @llvm.lifetime.end.p0(ptr %val) #6, !dbg !94 + call void @llvm.lifetime.end.p0(ptr %j), !dbg !94 + call void @llvm.lifetime.end.p0(ptr %val), !dbg !94 ret i32 0, !dbg !95 } ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #4 - -declare dso_local i32 @sum(ptr, i32) #5 +declare i64 @llvm.expect.i64(i64, i64) -declare dso_local i32 @random_sample(ptr, i32) #5 +declare dso_local i32 @sum(ptr, i32) -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { nounwind readnone speculatable willreturn } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind readnone willreturn } -attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #6 = { nounwind } +declare dso_local i32 @random_sample(ptr, i32) !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!17, !18, !19}