Diffstat (limited to 'llvm/test/Transforms')
47 files changed, 2350 insertions, 741 deletions
diff --git a/llvm/test/Transforms/ADCE/2016-09-06.ll b/llvm/test/Transforms/ADCE/2016-09-06.ll
index 850f412..1329ac6 100644
--- a/llvm/test/Transforms/ADCE/2016-09-06.ll
+++ b/llvm/test/Transforms/ADCE/2016-09-06.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define i32 @foo(i32, i32, i32) #0 {
+define i32 @foo(i32, i32, i32) {
   %4 = alloca i32, align 4
   %5 = alloca i32, align 4
   %6 = alloca i32, align 4
@@ -48,8 +48,6 @@ B21:
   ret i32 %I22
 }
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 4.0.0"}
diff --git a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
index 9708be9..5e844b4 100644
--- a/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
+++ b/llvm/test/Transforms/ADCE/blocks-with-dead-term-nondeterministic.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.10.0"
 ; CHECK: uselistorder label %bb16, { 1, 0 }
 
 ; Function Attrs: noinline nounwind ssp uwtable
-define void @ham(i1 %arg) local_unnamed_addr #0 {
+define void @ham(i1 %arg) local_unnamed_addr {
 bb:
   br i1 false, label %bb1, label %bb22
 
@@ -64,8 +64,6 @@ bb22:                                             ; preds = %bb21, %bb
   ret void
 }
 
-attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/Transforms/AddDiscriminators/basic.ll b/llvm/test/Transforms/AddDiscriminators/basic.ll
index 5186537..fc4c10a 100644
--- a/llvm/test/Transforms/AddDiscriminators/basic.ll
+++ b/llvm/test/Transforms/AddDiscriminators/basic.ll
@@ -11,7 +11,7 @@
 ;   if (i < 10) x = i;
 ; }
 
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
 entry:
   %i.addr = alloca i32, align 4
   %x = alloca i32, align 4
@@ -35,8 +35,6 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: ret void, !dbg ![[END:[0-9]+]]
 }
 
-attributes #0 = { nounwind uwtable noinline optnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call-nested.ll b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
index 99340a5..f1373e4 100644
--- a/llvm/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call-nested.ll
@@ -9,7 +9,7 @@
 ; #6 }
 
 ; Function Attrs: uwtable
-define i32 @_Z3bazv() #0 !dbg !4 {
+define i32 @_Z3bazv() !dbg !4 {
   %1 = call i32 @_Z3barv(), !dbg !11
 ; CHECK: %1 = call i32 @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
   %2 = call i32 @_Z3barv(), !dbg !12
@@ -19,12 +19,9 @@ define i32 @_Z3bazv() #0 !dbg !4 {
   ret i32 %3, !dbg !14
 }
 
-declare i32 @_Z3fooii(i32, i32) #1
+declare i32 @_Z3fooii(i32, i32)
 
-declare i32 @_Z3barv() #1
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare i32 @_Z3barv()
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/call.ll b/llvm/test/Transforms/AddDiscriminators/call.ll
index 93d3aa4..11b21ef 100644
--- a/llvm/test/Transforms/AddDiscriminators/call.ll
+++ b/llvm/test/Transforms/AddDiscriminators/call.ll
@@ -8,7 +8,7 @@
 ; #5 }
 
 ; Function Attrs: uwtable
-define void @_Z3foov() #0 !dbg !4 {
+define void @_Z3foov() !dbg !4 {
   call void @_Z3barv(), !dbg !10
 ; CHECK: call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
   %a = alloca [100 x i8], align 16
@@ -21,13 +21,10 @@ define void @_Z3foov() #0 !dbg !4 {
   ret void, !dbg !13
 }
 
-declare void @_Z3barv() #1
+declare void @_Z3barv()
 
 declare void @llvm.lifetime.start.p0(ptr nocapture) nounwind argmemonly
 
 declare void @llvm.lifetime.end.p0(ptr nocapture) nounwind argmemonly
 
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/diamond.ll b/llvm/test/Transforms/AddDiscriminators/diamond.ll
index c93a57a..9edcf39 100644
--- a/llvm/test/Transforms/AddDiscriminators/diamond.ll
+++ b/llvm/test/Transforms/AddDiscriminators/diamond.ll
@@ -12,7 +12,7 @@
 ; bar(3): discriminator 2
 
 ; Function Attrs: uwtable
-define void @_Z3fooi(i32 %i) #0 !dbg !4 {
+define void @_Z3fooi(i32 %i) !dbg !4 {
   %1 = alloca i32, align 4
   store i32 %i, ptr %1, align 4
   call void @llvm.dbg.declare(metadata ptr %1, metadata !11, metadata !12), !dbg !13
@@ -34,13 +34,9 @@ define void @_Z3fooi(i32 %i) #0 !dbg !4 {
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
-declare void @_Z3bari(i32) #2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z3bari(i32)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/AddDiscriminators/first-only.ll b/llvm/test/Transforms/AddDiscriminators/first-only.ll
index 7ae9ed0..415e5f0 100644
--- a/llvm/test/Transforms/AddDiscriminators/first-only.ll
+++ b/llvm/test/Transforms/AddDiscriminators/first-only.ll
@@ -13,7 +13,7 @@
 ;   }
 ; }
 
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
 entry:
   %i.addr = alloca i32, align 4
   %x = alloca i32, align 4
@@ -44,8 +44,6 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: ret void, !dbg ![[END:[0-9]+]]
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/invoke.ll b/llvm/test/Transforms/AddDiscriminators/invoke.ll
index d39014d..a3989b6 100644
--- a/llvm/test/Transforms/AddDiscriminators/invoke.ll
+++ b/llvm/test/Transforms/AddDiscriminators/invoke.ll
@@ -5,14 +5,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.14.0"
 
 ; Function Attrs: ssp uwtable
-define void @_Z3foov() #0 personality ptr @__gxx_personality_v0 !dbg !8 {
+define void @_Z3foov() personality ptr @__gxx_personality_v0 !dbg !8 {
 entry:
   %exn.slot = alloca ptr
   %ehselector.slot = alloca i32
 ; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL1:[0-9]+]]
-  call void @_Z12bar_noexceptv() #4, !dbg !11
+  call void @_Z12bar_noexceptv(), !dbg !11
 ; CHECK: call void @_Z12bar_noexceptv({{.*}} !dbg ![[CALL2:[0-9]+]]
-  call void @_Z12bar_noexceptv() #4, !dbg !13
+  call void @_Z12bar_noexceptv(), !dbg !13
   invoke void @_Z3barv()
 ; CHECK: unwind label {{.*}} !dbg ![[INVOKE:[0-9]+]]
           to label %invoke.cont unwind label %lpad, !dbg !14
@@ -31,8 +31,8 @@ lpad:                                             ; preds = %entry
 catch:                                            ; preds = %lpad
   %exn = load ptr, ptr %exn.slot, align 8, !dbg !15
-  %3 = call ptr @__cxa_begin_catch(ptr %exn) #4, !dbg !15
-  invoke void @__cxa_rethrow() #5
+  %3 = call ptr @__cxa_begin_catch(ptr %exn), !dbg !15
+  invoke void @__cxa_rethrow()
           to label %unreachable unwind label %lpad1, !dbg !17
 
 lpad1:                                            ; preds = %catch
@@ -62,7 +62,7 @@ terminate.lpad:                                   ; preds = %lpad1
   %7 = landingpad { ptr, i32 }
           catch ptr null, !dbg !20
   %8 = extractvalue { ptr, i32 } %7, 0, !dbg !20
-  call void @__clang_call_terminate(ptr %8) #6, !dbg !20
+  call void @__clang_call_terminate(ptr %8), !dbg !20
   unreachable, !dbg !20
 
 unreachable:                                      ; preds = %catch
@@ -70,9 +70,9 @@ unreachable:                                      ; preds = %catch
   unreachable
 }
 
 ; Function Attrs: nounwind
-declare void @_Z12bar_noexceptv() #1
+declare void @_Z12bar_noexceptv()
 
-declare void @_Z3barv() #2
+declare void @_Z3barv()
 
 declare i32 @__gxx_personality_v0(...)
@@ -83,22 +83,14 @@ declare void @__cxa_rethrow()
 declare void @__cxa_end_catch()
 
 ; Function Attrs: noinline noreturn nounwind
-define linkonce_odr hidden void @__clang_call_terminate(ptr) #3 {
-  %2 = call ptr @__cxa_begin_catch(ptr %0) #4
-  call void @_ZSt9terminatev() #6
+define linkonce_odr hidden void @__clang_call_terminate(ptr) {
+  %2 = call ptr @__cxa_begin_catch(ptr %0)
+  call void @_ZSt9terminatev()
   unreachable
 }
 
 declare void @_ZSt9terminatev()
 
-attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline noreturn nounwind }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn }
-attributes #6 = { noreturn nounwind }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
 !llvm.ident = !{!7}
diff --git a/llvm/test/Transforms/AddDiscriminators/multiple.ll b/llvm/test/Transforms/AddDiscriminators/multiple.ll
index 54c1a5d..8e8ca6a 100644
--- a/llvm/test/Transforms/AddDiscriminators/multiple.ll
+++ b/llvm/test/Transforms/AddDiscriminators/multiple.ll
@@ -10,7 +10,7 @@
 ; The two stores inside the if-then-else line must have different discriminator
 ; values.
 
-define void @foo(i32 %i) #0 !dbg !4 {
+define void @foo(i32 %i) !dbg !4 {
 entry:
   %i.addr = alloca i32, align 4
   %x = alloca i32, align 4
@@ -45,8 +45,6 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void, !dbg !12
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
index c23edd6..f84579b 100644
--- a/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/llvm/test/Transforms/AddDiscriminators/no-discriminators.ll
@@ -12,7 +12,7 @@
 ; altered. If they are, it means that the discriminators pass added a
 ; new lexical scope.
-define i32 @foo(i64 %i) #0 !dbg !4 {
+define i32 @foo(i64 %i) !dbg !4 {
 entry:
   %retval = alloca i32, align 4
   %i.addr = alloca i64, align 8
@@ -39,10 +39,7 @@ return:                                           ; preds = %if.else, %if.then
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 ; We should be able to add discriminators even in the absence of llvm.dbg.cu.
 ; When using sample profiles, the front end will generate line tables but it
diff --git a/llvm/test/Transforms/AddDiscriminators/oneline.ll b/llvm/test/Transforms/AddDiscriminators/oneline.ll
index 533d547..fc1675b 100644
--- a/llvm/test/Transforms/AddDiscriminators/oneline.ll
+++ b/llvm/test/Transforms/AddDiscriminators/oneline.ll
@@ -10,7 +10,7 @@
 ; return 100: discriminator 4
 ; return 99: discriminator 6
 
-define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
+define i32 @_Z3fooi(i32 %i) !dbg !4 {
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   store i32 %i, ptr %2, align 4, !tbaa !13
@@ -49,10 +49,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
index eb7d78f..4704238 100644
--- a/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
+++ b/llvm/test/Transforms/Attributor/reduced/register_benchmark_test.ll
@@ -1557,24 +1557,24 @@ declare dso_local void @_GLOBAL__sub_I_register_benchmark_test.cc() #0 section "
 ; Function Attrs: cold noreturn nounwind
 declare void @llvm.trap() #20
 
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #3 = { nounwind }
-attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #5 = { argmemonly nounwind willreturn }
-attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { alwaysinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #7 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #8 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #10 = { inlinehint uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #11 = { inlinehint nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #12 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #13 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #14 = { nounwind readnone willreturn }
-attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #15 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #16 = { noinline noreturn nounwind }
 attributes #17 = { argmemonly nounwind willreturn writeonly }
-attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #18 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #19 = { inlinehint noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-jump-tables"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #20 = { cold noreturn nounwind }
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
index d272fef..189186b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"
 
 ; CHECK-LABEL: @f
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
 ; CHECK: call i32 @llvm.bitreverse.i32
 entry:
   br label %for.body
 
@@ -25,8 +25,6 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
 }
 
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
index eec0967..35115cf 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/bitreverse-hang.ll
@@ -21,7 +21,7 @@
 @b = common global i32 0, align 4
 
 ; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
 entry:
   %b.promoted = load i32, ptr @b, align 4, !tbaa !2
   br label %for.body
@@ -40,8 +40,6 @@ for.end:                                          ; preds = %for.body
   ret i32 undef
 }
 
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
index 1c990ff..14360fe 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dom-tree.ll
@@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"
 
-define i32 @f(i32 %a) #0 {
+define i32 @f(i32 %a) {
 entry:
   br label %for.body
 
@@ -30,8 +30,6 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
 }
 
-attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 46fa066..78760db 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-pc-windows-msvc"
 ; BFIHOIST: br label %endif
 
 ; Function Attrs: norecurse
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr #0 personality ptr @__CxxFrameHandler3 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr personality ptr @__CxxFrameHandler3 {
   %call = tail call i64 @fn(i64 0)
   %call1 = tail call i64 @fn(i64 1)
   %tobool = icmp eq i32 %argc, 0
@@ -62,9 +62,6 @@ endif:
   ret i32 0
 }
 
-declare i64 @fn(i64) local_unnamed_addr #1
+declare i64 @fn(i64) local_unnamed_addr
 
 declare i32 @__CxxFrameHandler3(...)
-
-attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll
index d1f1922..109be51f 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug.ll
@@ -14,7 +14,7 @@ entry:
   %0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16
   %1 = call i64 @llvm.coro.size.i64(), !dbg !16
   %call = call ptr @malloc(i64 %1), !dbg !16
-  %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16
+  %2 = call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !16
   store ptr %2, ptr %coro_hdl, align 8, !dbg !16
   %3 = call i8 @llvm.coro.suspend(token none, i1 false), !dbg !17
   %conv = sext i8 %3 to i32, !dbg !17
@@ -69,7 +69,7 @@ coro_Cleanup:                                     ; preds = %sw.epilog, %sw.bb1
   br label %coro_Suspend, !dbg !24
 
 coro_Suspend:                                     ; preds = %coro_Cleanup, %sw.default
-  call void @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24
+  call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !24
   %7 = load ptr, ptr %coro_hdl, align 8, !dbg !24
   store i32 0, ptr %late_local, !dbg !24
   ret ptr %7, !dbg !24
@@ -82,47 +82,40 @@ ehcleanup:
 }
 
 ; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 ; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 ; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
 
-declare ptr @malloc(i64) #3
+declare ptr @malloc(i64)
 
 declare ptr @allocate()
 declare void @print({ ptr, i32 })
 declare void @log()
 
 ; Function Attrs: nounwind readnone
-declare i64 @llvm.coro.size.i64() #4
+declare i64 @llvm.coro.size.i64()
 
 ; Function Attrs: nounwind
-declare ptr @llvm.coro.begin(token, ptr writeonly) #5
+declare ptr @llvm.coro.begin(token, ptr writeonly)
 
 ; Function Attrs: nounwind
-declare i8 @llvm.coro.suspend(token, i1) #5
+declare i8 @llvm.coro.suspend(token, i1)
 
-declare void @free(ptr) #3
+declare void @free(ptr)
 
 ; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
 
 ; Function Attrs: nounwind
-declare void @llvm.coro.end(ptr, i1, token) #5
+declare void @llvm.coro.end(ptr, i1, token)
 
 ; Function Attrs: argmemonly nounwind readonly
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #2
-
-attributes #0 = { noinline nounwind presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { argmemonly nounwind readonly }
-attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone }
-attributes #5 = { nounwind }
-attributes #6 = { alwaysinline }
-attributes #7 = { noduplicate }
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
+
+attributes #0 = { noinline nounwind presplitcoroutine }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
index c53bea8..577ca9a 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll
@@ -6,9 +6,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
-declare void @bar(...) local_unnamed_addr #2
+declare void @bar(...) local_unnamed_addr
 
 ; Function Attrs: nounwind uwtable
 define ptr @f() #3 !dbg !16 {
@@ -16,14 +16,14 @@ entry:
   %0 = tail call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !26
   %1 = tail call i64 @llvm.coro.size.i64(), !dbg !26
   %call = tail call ptr @malloc(i64 %1), !dbg !26
-  %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call) #9, !dbg !26
+  %2 = tail call ptr @llvm.coro.begin(token %0, ptr %call), !dbg !26
   tail call void @llvm.dbg.value(metadata ptr %2, metadata !21, metadata !12), !dbg !26
   br label %for.cond, !dbg !27
 
 for.cond:                                         ; preds = %for.cond, %entry
   tail call void @llvm.dbg.value(metadata i32 undef, metadata !22, metadata !12), !dbg !28
-  tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12) #7, !dbg !29
-  tail call void (...) @bar() #7, !dbg !33
+  tail call void @llvm.dbg.value(metadata i32 undef, metadata !11, metadata !12), !dbg !29
+  tail call void (...) @bar(), !dbg !33
   %3 = tail call token @llvm.coro.save(ptr null), !dbg !34
   %4 = tail call i8 @llvm.coro.suspend(token %3, i1 false), !dbg !34
   %conv = sext i8 %4 to i32, !dbg !34
@@ -38,40 +38,31 @@ coro_Cleanup:                                     ; preds = %for.cond
   br label %coro_Suspend, !dbg !36
 
 coro_Suspend:                                     ; preds = %for.cond, %if.then, %coro_Cleanup
-  tail call void @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38
+  tail call void @llvm.coro.end(ptr null, i1 false, token none), !dbg !38
   ret ptr %2, !dbg !39
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #4
+declare void @llvm.lifetime.start.p0(ptr nocapture)
 
 ; Function Attrs: argmemonly nounwind readonly
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #5
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
 
 ; Function Attrs: nounwind
-declare noalias ptr @malloc(i64) local_unnamed_addr #6
-declare i64 @llvm.coro.size.i64() #1
-declare ptr @llvm.coro.begin(token, ptr writeonly) #7
-declare token @llvm.coro.save(ptr) #7
-declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end.p0(ptr nocapture) #4
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #5
-declare void @free(ptr nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(ptr, i1, token) #7
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #5
+declare noalias ptr @malloc(i64) local_unnamed_addr
+declare i64 @llvm.coro.size.i64()
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
+declare ptr @llvm.coro.free(token, ptr nocapture readonly)
+declare void @free(ptr nocapture) local_unnamed_addr
+declare void @llvm.coro.end(ptr, i1, token)
+declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8)
 
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, metadata, metadata)
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable presplitcoroutine "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { argmemonly nounwind }
-attributes #5 = { argmemonly nounwind readonly }
-attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { nounwind }
-attributes #8 = { alwaysinline nounwind }
-attributes #9 = { noduplicate }
+attributes #3 = { nounwind uwtable presplitcoroutine }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
new file mode 100644
index 0000000..2eaa275
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -0,0 +1,332 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=dse -S %s | FileCheck %s
+
+define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = load double, ptr %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_strided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_unstrided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_unstrided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_non_matrix_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[DST_OFFSET]], align 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst.offset = getelementptr inbounds double, ptr %src, i32 6
+  store double 42.0, ptr %dst.offset
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_non_matrix_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_non_matrix_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6
+  store double 42.0, ptr %ptr.offset
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  store <8 x double> zeroinitializer, ptr %dst
+  ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <16 x double> zeroinitializer, ptr [[DST]], align 128
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+  store <16 x double> zeroinitializer, ptr %dst
+  ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <4 x double> zeroinitializer, ptr [[DST]], align
32 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + store <4 x double> zeroinitializer, ptr %dst + ret void +} + +define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + store <8 x double> zeroinitializer, ptr %dst + ret void +} + +define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + ret void +} + +define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll new file mode 100644 index 0000000..03bd45b --- /dev/null +++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: 
opt -passes=gvn -S %s | FileCheck %s + +define void @redundant_unstrided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 8 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_unstrided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 1 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_strided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void + +} + +define void @redundant_strided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: 
[[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @repeat_load_dimension_change_project(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_project( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +define void @repeat_load_dimension_change_shuffle(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) +declare void @use(<8 x double>) diff 
--git a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll index b3f2e81..15ce3e3 100644 --- a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll +++ b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll @@ -5,7 +5,7 @@ ; attributes that should be transferred only if they are on all of the regions. ; This includes the attributes no-nans-fp-math, -; no-signed-zeros-fp-math, less-precise-fpmad, unsafe-fp-math, and +; no-signed-zeros-fp-math, less-precise-fpmad, and ; no-infs-fp-math. Only when each instance of similarity has these attributes ; can we say that the outlined function can have these attributes since that ; is the more general case for these attributes. @@ -101,7 +101,7 @@ entry: } attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "less-precise-fpmad"="true" -"unsafe-fp-math"="true" "no-infs-fp-math"="true"} +"no-infs-fp-math"="true"} ; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) [[ATTR1:#[0-9]+]] { ; CHECK: entry_to_outline: @@ -122,5 +122,5 @@ attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "les ; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 -; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "unsafe-fp-math"="false" } -; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" } +; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" } +; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/Transforms/Inline/attributes.ll b/llvm/test/Transforms/Inline/attributes.ll index 55ab430..da7eeda 100644 --- a/llvm/test/Transforms/Inline/attributes.ll +++ b/llvm/test/Transforms/Inline/attributes.ll @@ -601,46 +601,6 @@ define i32 @test_no-signed-zeros-fp-math3(i32 %i) "no-signed-zeros-fp-math"="tru ; CHECK-NEXT: ret i32 } -define i32 @unsafe-fp-math_callee0(i32 %i) "unsafe-fp-math"="false" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee0(i32 %i) [[UNSAFE_FPMATH_FALSE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @unsafe-fp-math_callee1(i32 %i) "unsafe-fp-math"="true" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee1(i32 %i) [[UNSAFE_FPMATH_TRUE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math0(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math0(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math1(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math1(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math2(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math2(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math3(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32
@unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math3(i32 %i) [[UNSAFE_FPMATH_TRUE]] { -; CHECK-NEXT: ret i32 -} - ; Test that fn_ret_thunk_extern has no CompatRule; inlining is permitted. ; Test that fn_ret_thunk_extern has no MergeRule; fn_ret_thunk_extern is not ; propagated or dropped on the caller after inlining. @@ -693,6 +653,4 @@ define i32 @loader_replaceable_caller() { ; CHECK: attributes [[NO_NANS_FPMATH_TRUE]] = { "no-nans-fp-math"="true" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_FALSE]] = { "no-signed-zeros-fp-math"="false" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_TRUE]] = { "no-signed-zeros-fp-math"="true" } -; CHECK: attributes [[UNSAFE_FPMATH_FALSE]] = { "unsafe-fp-math"="false" } -; CHECK: attributes [[UNSAFE_FPMATH_TRUE]] = { "unsafe-fp-math"="true" } ; CHECK: attributes [[FNRETTHUNK_EXTERN]] = { fn_ret_thunk_extern } diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll index 2f1f9fc..fc9ab9f 100644 --- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll +++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll @@ -222,8 +222,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) define i32 @vec_to_scalar_select_scalar(i1 %b) { ; CHECK-LABEL: @vec_to_scalar_select_scalar( -; CHECK-NEXT: [[S:%.*]] = select i1 [[B:%.*]], <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4> -; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[S]]) +; CHECK-NEXT: [[C:%.*]] = select i1 [[B:%.*]], i32 3, i32 7 ; CHECK-NEXT: ret i32 [[C]] ; %s = select i1 %b, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4> @@ -371,3 +370,36 @@ define float @test_fabs_select_multiuse_both_constant(i1 %cond, float %x) { %fabs = call float @llvm.fabs.f32(float %select) ret float %fabs } + +; Negative test: Don't replace with select between vector mask and zeroinitializer. 
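+; With a non-constant bound the intrinsic call would survive the fold, and the scalar select of the bound would merely become a vector select of the mask; the fold only pays off when both arms constant-fold, as in the test after this one.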
+define <16 x i1> @test_select_of_active_lane_mask_bound(i64 %base, i64 %n, i1 %cond) { +; CHECK-LABEL: @test_select_of_active_lane_mask_bound( +; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i64 [[N:%.*]], i64 0 +; CHECK-NEXT: [[MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[BASE:%.*]], i64 [[S]]) +; CHECK-NEXT: ret <16 x i1> [[MASK]] +; + %s = select i1 %cond, i64 %n, i64 0 + %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %s) + ret <16 x i1> %mask +} + +define <16 x i1> @test_select_of_active_lane_mask_bound_both_constant(i64 %base, i64 %n, i1 %cond) { +; CHECK-LABEL: @test_select_of_active_lane_mask_bound_both_constant( +; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], <16 x i1> splat (i1 true), <16 x i1> zeroinitializer +; CHECK-NEXT: ret <16 x i1> [[MASK]] +; + %s = select i1 %cond, i64 16, i64 0 + %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %s) + ret <16 x i1> %mask +} + +define { i64, i1 } @test_select_of_overflow_intrinsic_operand(i64 %n, i1 %cond) { +; CHECK-LABEL: @test_select_of_overflow_intrinsic_operand( +; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[N:%.*]], i64 42) +; CHECK-NEXT: [[ADD_OVERFLOW:%.*]] = select i1 [[COND:%.*]], { i64, i1 } [[TMP1]], { i64, i1 } { i64 42, i1 false } +; CHECK-NEXT: ret { i64, i1 } [[ADD_OVERFLOW]] +; + %s = select i1 %cond, i64 %n, i64 0 + %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %s, i64 42) + ret { i64, i1 } %add_overflow +} diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll index 3454835..1c8b21c 100644 --- a/llvm/test/Transforms/InstCombine/phi.ll +++ b/llvm/test/Transforms/InstCombine/phi.ll @@ -3026,6 +3026,56 @@ join: ret i32 %umax } +define i32 @cross_lane_intrinsic_over_phi(i1 %c, i1 %c2, <4 x i32> %a) { +; CHECK-LABEL: @cross_lane_intrinsic_over_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A:%.*]]) +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], [[IF]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @may_exit() +; CHECK-NEXT: ret i32 [[PHI]] +; +entry: + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi <4 x i32> [ %a, %if ], [ zeroinitializer, %entry ] + call void @may_exit() + %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %phi) + ret i32 %sum +} + +define { i64, i1 } @overflow_intrinsic_over_phi(i1 %c, i64 %a) { +; CHECK-LABEL: @overflow_intrinsic_over_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 1) +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi { i64, i1 } [ [[TMP0]], [[IF]] ], [ { i64 1, i1 false }, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @may_exit() +; CHECK-NEXT: ret { i64, i1 } [[PHI]] +; +entry: + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %phi = phi i64 [ %a, %if ], [ 0, %entry ] + call void @may_exit() + %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %phi, i64 1) + ret { i64, i1 } %add_overflow +} + define i32 @multiple_intrinsics_with_multiple_phi_uses(i1 %c, i32 %arg) { ; CHECK-LABEL: @multiple_intrinsics_with_multiple_phi_uses( ; CHECK-NEXT: entry: diff --git 
a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll index 410c43c..f19cca8 100644 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -40,3 +40,134 @@ define i128 @ptrtoaddr_sext(ptr %p) { %ext = sext i64 %p.addr to i128 ret i128 %ext } + +define i64 @sub_ptrtoaddr(ptr %p, i64 %offset) { +; CHECK-LABEL: define i64 @sub_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i64 [[OFFSET]] +; + %p2 = getelementptr i8, ptr %p, i64 %offset + %p.addr = ptrtoaddr ptr %p to i64 + %p2.addr = ptrtoaddr ptr %p2 to i64 + %sub = sub i64 %p2.addr, %p.addr + ret i64 %sub +} + +define i64 @sub_ptrtoint_ptrtoaddr(ptr %p, i64 %offset) { +; CHECK-LABEL: define i64 @sub_ptrtoint_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i64 [[OFFSET]] +; + %p2 = getelementptr i8, ptr %p, i64 %offset + %p.int = ptrtoint ptr %p to i64 + %p2.addr = ptrtoaddr ptr %p2 to i64 + %sub = sub i64 %p2.addr, %p.int + ret i64 %sub +} + +define i32 @sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) { +; CHECK-LABEL: define i32 @sub_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i32 [[OFFSET]] +; + %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %sub = sub i32 %p2.addr, %p.addr + ret i32 %sub +} + +define i32 @sub_trunc_ptrtoaddr(ptr %p, i64 %offset) { +; CHECK-LABEL: define i32 @sub_trunc_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = trunc i64 [[OFFSET]] to i32 +; CHECK-NEXT: ret i32 [[SUB]] +; + %p2 = getelementptr i8, ptr %p, i64 %offset + %p.addr = ptrtoaddr ptr %p to i64 + %p2.addr = ptrtoaddr ptr %p2 to i64 + %p.addr.trunc = trunc i64 %p.addr to i32 + %p2.addr.trunc = trunc i64 %p2.addr to i32 + %sub = sub i32 %p2.addr.trunc, %p.addr.trunc + ret i32 %sub +} + +define i16 @sub_trunc_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) { +; CHECK-LABEL: define i16 @sub_trunc_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16 +; CHECK-NEXT: ret i16 [[SUB]] +; + %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %p.addr.trunc = trunc i32 %p.addr to i16 + %p2.addr.trunc = trunc i32 %p2.addr to i16 + %sub = sub i16 %p2.addr.trunc, %p.addr.trunc + ret i16 %sub +} + +define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) { +; CHECK-LABEL: define i16 @sub_trunc_ptrtoint_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = trunc i32 [[OFFSET]] to i16 +; CHECK-NEXT: ret i16 [[SUB]] +; + %p2 = getelementptr i8, ptr addrspace(1) %p, i32 %offset + %p.int = ptrtoint ptr addrspace(1) %p to i64 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %p.int.trunc = trunc i64 %p.int to i16 + %p2.addr.trunc = trunc i32 %p2.addr to i16 + %sub = sub i16 %p2.addr.trunc, %p.int.trunc + ret i16 %sub +} + +define i128 @sub_zext_ptrtoaddr(ptr %p, i64 %offset) { +; CHECK-LABEL: define i128 @sub_zext_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = zext i64 [[OFFSET]] to i128 +; CHECK-NEXT: ret i128 [[SUB]] +; + %p2 = getelementptr nuw i8, ptr %p, i64 %offset 
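+ ; The nuw on the getelementptr above guarantees the address arithmetic cannot wrap, which is what allows the zexted ptrtoaddr difference below to fold to a zext of %offset.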
+ %p.addr = ptrtoaddr ptr %p to i64 + %p2.addr = ptrtoaddr ptr %p2 to i64 + %p.addr.ext = zext i64 %p.addr to i128 + %p2.addr.ext = zext i64 %p2.addr to i128 + %sub = sub i128 %p2.addr.ext, %p.addr.ext + ret i128 %sub +} + +define i64 @sub_zext_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) { +; CHECK-LABEL: define i64 @sub_zext_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = zext i32 [[OFFSET]] to i64 +; CHECK-NEXT: ret i64 [[SUB]] +; + %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %p.addr.ext = zext i32 %p.addr to i64 + %p2.addr.ext = zext i32 %p2.addr to i64 + %sub = sub i64 %p2.addr.ext, %p.addr.ext + ret i64 %sub +} + +define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %offset) { +; CHECK-LABEL: define i128 @sub_zext_ptrtoint_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[P2:%.*]] = getelementptr nuw i8, ptr addrspace(1) [[P]], i32 [[OFFSET]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +; CHECK-NEXT: [[P2_ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[P2]] to i32 +; CHECK-NEXT: [[P_INT_EXT:%.*]] = zext i64 [[P_INT]] to i128 +; CHECK-NEXT: [[P2_ADDR_EXT:%.*]] = zext i32 [[P2_ADDR]] to i128 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i128 [[P2_ADDR_EXT]], [[P_INT_EXT]] +; CHECK-NEXT: ret i128 [[SUB]] +; + %p2 = getelementptr nuw i8, ptr addrspace(1) %p, i32 %offset + %p.int = ptrtoint ptr addrspace(1) %p to i64 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %p.int.ext = zext i64 %p.int to i128 + %p2.addr.ext = zext i32 %p2.addr to i128 + %sub = sub i128 %p2.addr.ext, %p.int.ext + ret i128 %sub +} diff --git a/llvm/test/Transforms/InstCombine/select_frexp.ll b/llvm/test/Transforms/InstCombine/select_frexp.ll index d025aed..ccfcca1 100644 --- a/llvm/test/Transforms/InstCombine/select_frexp.ll +++ b/llvm/test/Transforms/InstCombine/select_frexp.ll @@ -115,10 +115,10 @@ define float @test_select_frexp_no_const(float %x, float %y, i1 %cond) { define i32 @test_select_frexp_extract_exp(float %x, i1 %cond) { ; CHECK-LABEL: define i32 @test_select_frexp_extract_exp( ; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) { -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SEL]]) +; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) ; CHECK-NEXT: [[FREXP_1:%.*]] = extractvalue { float, i32 } [[FREXP]], 1 -; CHECK-NEXT: ret i32 [[FREXP_1]] +; CHECK-NEXT: [[FREXP_2:%.*]] = select i1 [[COND]], i32 1, i32 [[FREXP_1]] +; CHECK-NEXT: ret i32 [[FREXP_2]] ; %sel = select i1 %cond, float 1.000000e+00, float %x %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %sel) @@ -132,7 +132,7 @@ define float @test_select_frexp_fast_math_select(float %x, i1 %cond) { ; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) { ; CHECK-NEXT: [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) ; CHECK-NEXT: [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0 -; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]] +; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]] ; CHECK-NEXT: ret float [[SELECT_FREXP]] ; %sel = select nnan ninf nsz i1 %cond, float 1.000000e+00, float %x diff --git 
a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll deleted file mode 100644 index 7816781..0000000 --- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll +++ /dev/null @@ -1,243 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes=instcombine | FileCheck %s -@A = extern_weak global float, align 4 - -; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true -; and false values, while %v1 and %phi.to.remove are actually the same. -; Fold the selection instruction %same.as.v1 to %v1. -define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; 
%phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. -define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; same.as.v1 are swapped. -; Check that %same.as.v1 can be folded. 
-define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_3( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. 
-define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index 8eeaea1..ee70137 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instcombine < %s | FileCheck %s -target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32" +target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-p2:32:32-p3:32:32:32:16" define i64 
@test_inbounds(ptr %base, i64 %idx) { ; CHECK-LABEL: @test_inbounds( @@ -505,6 +505,23 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw(i32 %offset) { ret i64 %D } +define i64 @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating(i32 %offset) { +; CHECK-LABEL: @negative_zext_ptrtoint_sub_zext_ptrtoint_as2_nuw_truncating( +; CHECK-NEXT: [[A:%.*]] = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 [[OFFSET:%.*]] +; CHECK-NEXT: [[A_IDX:%.*]] = ptrtoint ptr addrspace(2) [[A]] to i32 +; CHECK-NEXT: [[E:%.*]] = zext i32 [[A_IDX]] to i64 +; CHECK-NEXT: [[D:%.*]] = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64 +; CHECK-NEXT: [[E1:%.*]] = sub nsw i64 [[E]], [[D]] +; CHECK-NEXT: ret i64 [[E1]] +; + %A = getelementptr nuw bfloat, ptr addrspace(2) @Arr_as2, i32 %offset + %B = ptrtoint ptr addrspace(2) %A to i32 + %C = zext i32 %B to i64 + %D = zext i16 ptrtoint (ptr addrspace(2) @Arr_as2 to i16) to i64 + %E = sub i64 %C, %D + ret i64 %E +} + define i64 @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local(ptr addrspace(2) %p, i32 %offset) { ; CHECK-LABEL: @ptrtoint_sub_zext_ptrtoint_as2_inbounds_local( ; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds bfloat, ptr addrspace(2) [[P:%.*]], i32 [[OFFSET:%.*]] @@ -614,6 +631,20 @@ define i64 @negative_zext_ptrtoint_sub_ptrtoint_as2_nuw_local(ptr addrspace(2) % ret i64 %D } +define i64 @zext_ptrtoint_sub_ptrtoint_as3_nuw_local(ptr addrspace(3) %p, i16 %offset) { +; CHECK-LABEL: @zext_ptrtoint_sub_ptrtoint_as3_nuw_local( +; CHECK-NEXT: [[SUB:%.*]] = zext i16 [[GEP_IDX:%.*]] to i64 +; CHECK-NEXT: ret i64 [[SUB]] +; + %gep = getelementptr nuw i8, ptr addrspace(3) %p, i16 %offset + %gep.int = ptrtoint ptr addrspace(3) %gep to i32 + %p.int = ptrtoint ptr addrspace(3) %p to i32 + %gep.int.ext = zext i32 %gep.int to i64 + %p.int.ext = zext i32 %p.int to i64 + %sub = sub i64 %gep.int.ext, %p.int.ext + ret i64 %sub +} + define i64 @test30(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test30( ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 11cc971..fb7890a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll index ae8dc2d..005ca8c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll @@ -175,18 +175,28 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 -; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -227,18 +237,28 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index b23702d..2a19402 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8
+; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8
+; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]]
-; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -435,7 +435,7 @@ exit:
 ret void
}
-; We should interleave by 2 after narrowing interleave groups to saturate
+; FIXME: We should interleave by 2 after narrowing interleave groups to saturate
; load/store units.
define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK-LABEL: define void @test_interleave_after_narrowing(
@@ -447,18 +447,12 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
-; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 2865495..c261760 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -13,8 +13,12 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8
-; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8
+; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8
+; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index 829acbbf..b63e03d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -1,81 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
-; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s
-; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s
target triple = "aarch64-unknown-linux"
define void @load_store_interleave_group(ptr noalias %data) {
-; IC1-LABEL: define void @load_store_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8
-; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @load_store_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -105,82 +53,27 @@ exit:
}
define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) {
-; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
-; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
-; IC1-NEXT: [[ENTRY:.*:]]
-; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]]
-; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IC1: [[VECTOR_PH]]:
-; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
-; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; IC1: [[VECTOR_BODY]]:
-; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1
-; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP4]]
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8
-; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]]
-; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8
-; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]]
-; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
-; IC1: [[SCALAR_PH]]:
-;
; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]]
; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]]
-; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]]
; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -210,3 +103,175 @@ loop:
exit:
 ret void
}
+
+define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) {
+; IC1-LABEL: define void @test_masked_interleave_group(
+; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; IC1-NEXT: [[ENTRY:.*:]]
+; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IC1: [[VECTOR_MEMCHECK]]:
+; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IC1: [[VECTOR_PH]]:
+; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1: [[VECTOR_BODY]]:
+; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IC1: [[MIDDLE_BLOCK]]:
+; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; IC1: [[SCALAR_PH]]:
+;
+; CHECK-LABEL: define void @test_masked_interleave_group(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ]
+ %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ]
+ %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ]
+ %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1
+ %mask.val = load i8, ptr %mask.iv, align 1
+ %should.copy = icmp eq i8 %mask.val, 0
+ br i1 %should.copy, label %then, label %loop.latch
+
+then:
+ %elem0 = load float, ptr %src.iv, align 4
+ store float %elem0, ptr %dst.iv, align 4
+ %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4
+ %s1 = load float, ptr %src.1.ptr, align 4
+ %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4
+ store float %s1, ptr %dst.1.ptr, align 4
+ %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8
+ %s2 = load float, ptr %src.2.ptr, align 4
+ %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8
+ store float %s2, ptr %dst.2.ptr, align 4
+ %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12
+ %s3 = load float, ptr %src.3.ptr, align 4
+ %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12
+ store float %s3, ptr %dst.3.ptr, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i32 %iv, 1
+ %src.iv.next = getelementptr i8, ptr %src.iv, i64 16
+ %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16
+ %ec = icmp eq i32 %iv, %N
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index abfb44d..d290f2d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -60,26 +60,32 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]]
-; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index f2e689c..75980ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -328,8 +328,10 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index c8d20dc..b26e9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi"
%struct.TwoInts = type { i32, i32 }
%struct.ThreeInts = type { i32, i32, i32 }
%struct.FourInts = type { i32, i32, i32, i32 }
+%struct.TwoShorts = type { i16, i16 }
%struct.ThreeShorts = type { i16, i16, i16 }
%struct.FourShorts = type { i16, i16, i16, i16 }
%struct.TwoBytes = type { i8, i8 }
@@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi"
%struct.FourBytes = type { i8, i8, i8, i8 }
%struct.FiveBytes = type { i8, i8, i8, i8, i8 }
%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.TwoFloats = type { float, float }
+%struct.FourFloats = type { float, float, float, float }
; CHECK-LABEL: two_ints_same_op
; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
@@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
34: ; preds = %6, %4
 ret void
}
+
+; CHECK-LABEL: two_floats_same_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_vary_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp20.not = icmp eq i32 %N, 0
+ br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %inc = add nuw i32 %i.021, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i8
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv9, ptr %y11, align 1
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i8
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv8, ptr %y10, align 1
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_same_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_vary_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i16
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv9, ptr %y11, align 2
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i16
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv8, ptr %y10, align 2
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 1
+define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %mul14 = fmul float %4, %5
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul14, ptr %z16, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %mul20 = fmul float %6, %7
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %mul20, ptr %w22, align 4
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 1
+define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp42.not = icmp eq i32 %N, 0
+ br i1 %cmp42.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z12, align 4
+ %mul = fmul float %4, %5
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul, ptr %z14, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w17, align 4
+ %div = fdiv float %6, %7
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %div, ptr %w19, align 4
+ %inc = add nuw i32 %i.043, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 43
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp52.not = icmp eq i32 %N, 0
+ br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.053 = phi i32 [ %inc, %for.body ], [ 0,
%entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: 
four_floats_four_bytes_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, 
%for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, 
i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + 
%conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll index f4d80af..2a3ce03 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -8,69 +8,15 @@ target triple = "x86_64-unknown-linux" define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { ; CHECK-LABEL: define void @test_4xi64( ; CHECK-SAME: ptr 
noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16 -; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> 
[[BROADCAST_SPLAT6]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]] -; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0 @@ -81,15 +27,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] ; CHECK-NEXT: 
[[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0 @@ -110,7 +56,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -171,7 +117,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -194,7 +140,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -249,7 +195,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -272,7 +218,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -327,7 +273,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: 
[[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -350,7 +296,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -405,7 +351,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -428,7 +374,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -489,7 +435,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -513,7 +459,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -573,7 +519,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], 
[[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -598,7 +544,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -707,7 +653,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -731,7 +677,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -765,20 +711,19 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] 
= distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
new file mode 100644
index 0000000..01934b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure the unsafe user-specified interleave count is ignored.
+
+; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-LABEL: @loop_distance_4
+define void @loop_distance_4(ptr %a, ptr %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ]
+  %0 = getelementptr i32, ptr %b, i64 %iv
+  %arrayidx = getelementptr i8, ptr %0, i64 -16
+  %1 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %0, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 64
+  br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = !{!1, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
diff --git a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
index 63279b0..92ea2c6 100644
--- a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
@@ -35,9 +35,8 @@ define void @ham() #1 {
; CHECK-NEXT: [[SNORK_EXIT:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 48 to ptr), align 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i64 0)
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP1]], <vscale x 16 x float> [[TMP2]], <vscale x 16 x float> undef
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[SPEC_SELECT]], i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP2]]
; CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP3]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll
index a3cf23b..f793814 100644
--- a/llvm/test/Transforms/SCCP/conditions-ranges.ll
+++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll
@@ -1547,3 +1547,28 @@ bb2:
  call void @use(i1 %c4)
  ret void
}
+
+define i1 @and_predicate_dominating_phi(i32 %x) {
+; CHECK-LABEL: @and_predicate_dominating_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+  %xge1 = icmp uge i32 %x, 1
+  %xlt2 = icmp ult i32 %x, 2
+  %and = and i1 %xge1, %xlt2
+  br i1 %and, label %phi, label %nope
+nope:
+  br label %phi
+phi:
+  %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+  %ret = icmp uge i32 %res, 1
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
index 0f18dc2..46e38d9 100644
--- a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
+++ b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S >%t
+; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S >%t
; RUN: FileCheck %s < %t

; ModuleID = 't.cc'
diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
index c7bc43e1..b61d659 100644
--- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
+++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S | \
+; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S | \
; RUN: FileCheck %s

; This case is copied from test/Transforms/SimplifyCFG/AArch64/
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 2e96a92..cc1dc4e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) {
  call void @foo(i1 %a15)
  ret void
}
+
+define i32 @test_and_with_phinode(i32 %x) {
+; CHECK-LABEL: @test_and_with_phinode(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+  %xge1 = icmp uge i32 %x, 1
+  %xlt2 = icmp ult i32 %x, 2
+  %and = and i1 %xge1, %xlt2
+  br i1 %and, label %phi, label %nope
+nope:
+  br label %phi
+phi:
+  %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/Util/dbg-user-of-aext.ll b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
index 9e7935e..b3d1b90 100644
--- a/llvm/test/Transforms/Util/dbg-user-of-aext.ll
+++ b/llvm/test/Transforms/Util/dbg-user-of-aext.ll
@@ -27,7 +27,7 @@
%struct.foo = type { i8, i64 }

; Function Attrs: noinline nounwind uwtable
-define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 {
+define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) !dbg !6 {
entry:
  %g = alloca %struct.foo, align 8
  %b.addr = alloca i8, align 1
@@ -51,10 +51,7 @@ entry:
; CHECK: ![[VAR_FRAG]] = !DILocalVariable(name: "frag"

; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
index ad23bf7..e9f0c8c 100644
--- a/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
+++ b/llvm/test/Transforms/Util/libcalls-fast-math-inf-loop.ll
@@ -19,18 +19,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Function Attrs: nounwind
-define float @fn(float %f) #0 {
+define float @fn(float %f) {
; CHECK: define float @fn(
; CHECK: call fast float @expf(
  %f.addr = alloca float, align 4
  store float %f, ptr %f.addr, align 4, !tbaa !1
  %1 = load float, ptr %f.addr, align 4, !tbaa !1
-  %call = call fast float @expf(float %1) #3
+  %call = call fast float @expf(float %1)
  ret float %call
}

; Function Attrs: inlinehint nounwind readnone
-define available_externally float @expf(float %x) #1 {
+define available_externally float @expf(float %x) {
; CHECK: define available_externally float @expf(
; CHECK: fpext float
; CHECK: call fast double @exp(
@@ -39,17 +39,13 @@ define available_externally float @expf(float %x) #1 {
  %x.addr = alloca float, align 4
  store float %x, ptr %x.addr, align 4, !tbaa !1
  %1 = load float, ptr %x.addr, align 4, !tbaa !1
  %conv = fpext float %1 to double
-  %call = call fast double @exp(double %conv) #3
+  %call = call fast double @exp(double %conv)
  %conv1 = fptrunc double %call to float
  ret float %conv1
}

; Function Attrs: nounwind readnone
-declare double @exp(double) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
"use-soft-float"="false" } -attributes #2 = { nounwind readnone } +declare double @exp(double) !llvm.ident = !{!0} diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll new file mode 100644 index 0000000..10566ae --- /dev/null +++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll @@ -0,0 +1,132 @@ +; -stats requires asserts +; REQUIRES: asserts + +; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled. +; Check that we skip devirtualization for empty functions in speculative devirtualization mode + +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \ +; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:13:0: devirtualized vf +; CHECK-NOT: devirtualized + +@vt1 = constant [1 x ptr] [ptr @vf], !type !8 +@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12 + +define i1 @vf(ptr %this) #0 !dbg !7 { + ret i1 true +} + +; This should NOT be devirtualized because during non-lto empty functions +; are skipped. +define void @vf_empty(ptr %this) !dbg !11 { + ret void +} + +; CHECK: define void @call +define void @call(ptr %obj) #1 !dbg !5 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !6 + ret void +} + + +; CHECK: define void @call1 +define void @call1(ptr %obj) #1 !dbg !9 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable, align 8 + ; CHECK: call i1 %fptr + %1 = call i1 %fptr(ptr %obj), !dbg !10 + ret void +} +declare ptr @llvm.load.relative.i32(ptr, i32) + +@vt3 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32) +], align 4, !type !15 + +; CHECK: define void @call2 +define void @call2(ptr %obj) #1 !dbg !13 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0) + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !14 + ret void +} + +@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [ + i32 0, ; offset to top + i32 0, ; rtti + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32) ; vf_emptyunc offset +] }, align 4, !type !18 + +; CHECK: define void @call3 +define void @call3(ptr %obj) #1 !dbg !16 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr 
+  ; CHECK: if.true.direct_targ:
+  ; CHECK: call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK: call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !17
+  ret void
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare i1 @llvm.public.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "devirt-single.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{!"clang version 4.0.0 (trunk 278098)"}
+!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 30, column: 32, scope: !5)
+!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !{i32 0, !"typeid"}
+
+!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!10 = !DILocation(line: 35, column: 32, scope: !9)
+!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!12 = !{i32 0, !"typeid1"}
+
+!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!14 = !DILocation(line: 41, column: 32, scope: !13)
+!15 = !{i32 0, !"typeid2"}
+
+!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!17 = !DILocation(line: 51, column: 32, scope: !16)
+!18 = !{i32 0, !"typeid3"}
+
+
+
+; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets
+; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations
diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
index d8f5c91..8327e1c 100644
--- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
+++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
@@ -11,6 +11,9 @@
; Check wildcard
; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP
+; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled.
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD
+
target datalayout = "e-p:64:64"
target triple = "x86_64-unknown-linux-gnu"
@@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32)
; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations
; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations
; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations
+
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of unique return value optimizations
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of virtual constant propagations
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of 1 bit virtual constant propagations