diff options
Diffstat (limited to 'llvm/test')
207 files changed, 10337 insertions, 5309 deletions
diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll new file mode 100644 index 0000000..1de8ab5 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; BasicAA should prove that loads from sufficiently large static offsets +; don't overlap with matrix loads with a statically known size. + +define <8 x double> @non_overlapping_strided_load(ptr %src) { +; CHECK-LABEL: Function: non_overlapping_strided_load: +; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) +; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 12 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + ret <8 x double> %l +} + +define <8 x double> @overlapping_strided_load(ptr %src) { +; CHECK-LABEL: Function: overlapping_strided_load: +; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) +; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 11 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + ret <8 x double> %l +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) diff --git a/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll new file mode 100644 index 0000000..458bd2e --- /dev/null +++ b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll @@ -0,0 +1,49 @@ +; RUN: split-file %s %t +; RUN: not llvm-as < %t/masked-store.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE +; RUN: not llvm-as < %t/masked-store-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE-ZERO +; RUN: not llvm-as < %t/masked-load.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD +; RUN: not llvm-as < %t/masked-load-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD-ZERO +; RUN: not llvm-as < %t/masked-scatter.ll 2>&1 | FileCheck %s --check-prefix=MASKED-SCATTER +; RUN: not llvm-as < %t/masked-gather.ll 2>&1 | FileCheck %s --check-prefix=MASKED-GATHER + +;--- masked-store.ll +; MASKED-STORE: LLVM ERROR: Invalid alignment argument +define void @masked_store(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 3, <2 x i1> %mask) + ret void +} + +;--- masked-store-zero.ll +; MASKED-STORE-ZERO: LLVM ERROR: Invalid zero alignment argument +define void @masked_store_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 0, <2 x i1> %mask) + ret void +} + +;--- masked-load.ll +; MASKED-LOAD: LLVM ERROR: Invalid alignment argument +define void @masked_load(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 3, <2 x i1> %mask, <2 x double> %val) + ret void +} + +;--- masked-load-zero.ll +; MASKED-LOAD-ZERO: LLVM ERROR: Invalid zero alignment argument +define void @masked_load_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 0, <2 x i1> %mask, <2 x double> %val) + ret void +} + +;--- masked-scatter.ll +; MASKED-SCATTER: LLVM ERROR: Invalid alignment argument +define void @masked_scatter(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) { + call void @llvm.masked.scatter.v2f64.p0(<2 x double> %val, <2 x ptr> %ptr, i32 3, <2 x i1> %mask) + ret void +} + +;--- masked-gather.ll +; MASKED-GATHER: LLVM ERROR: Invalid alignment argument +define void @masked_gather(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.gather.v2f64.p0(<2 x ptr> %ptr, i32 3, <2 x i1> %mask, <2 x double> %val) + ret void +} diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll deleted file mode 100644 index 00c0131..0000000 --- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll +++ /dev/null @@ -1,20 +0,0 @@ -; REQUIRES: x86-registered-target -; RUN: opt -module-summary %s -o %t.o - -; Ensure dead stripping performed flag is set on distributed index -; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ -; RUN: -r %t.o,glob,plx -; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD -; WITHDEAD: <FLAGS op0=97/> - -; Ensure dead stripping performed flag is not set on distributed index -; when option used to disable dead stripping computation. -; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ -; RUN: -r %t.o,glob,plx -compute-dead=false -; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD -; NODEAD: <FLAGS op0=96/> - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@glob = global i32 0 diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll new file mode 100644 index 0000000..e957ce6 --- /dev/null +++ b/llvm/test/Bitcode/thinlto-index-flags.ll @@ -0,0 +1,39 @@ +; REQUIRES: x86-registered-target +; RUN: opt -module-summary %s -o %t.o + +;; By default, the indexing step should perform and set the appropriate index +;; flags for dead stripping, attribute propagation, DSO local propagation, +;; and internalization/promotion. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL +;; The flag value should be 0x461 aka 1121: +;; 0x1: Dead stripping +;; 0x20: Attribute propagation +;; 0x40: DSO local propagation +;; 0x400: Internalization/promotion +; ALL: <FLAGS op0=1121/> + +;; Ensure dead stripping performed flag is not set on distributed index +;; when option used to disable dead stripping computation. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx -compute-dead=false +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD +;; Flag should be 0x460 aka 1120. +; NODEAD: <FLAGS op0=1120/> + +;; Disabling attribute propagation should disable that as well as DSO local +;; propagation. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx -propagate-attrs=false +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP +;; Flag should be 0x401 aka 1025. +; NOPROP: <FLAGS op0=1025/> + +;; Note there isn't currently a way to disable internalization+promotion, which +;; are performed together. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@glob = global i32 0 diff --git a/llvm/test/Bitcode/upgrade-branch-protection.ll b/llvm/test/Bitcode/upgrade-branch-protection.ll index 1b33e39..6f60ba5 100644 --- a/llvm/test/Bitcode/upgrade-branch-protection.ll +++ b/llvm/test/Bitcode/upgrade-branch-protection.ll @@ -1,8 +1,11 @@ -;; Test that module flags "branch-target-enforcement" and "sign-return-address" can be upgraded to -;; are upgraded from Error to Min. +;; Test that module flags "branch-target-enforcement" and "sign-return-address" +;; can be upgraded to are upgraded from Error to Min and the value is changed 2 +;; as the module is converted to the semantic. ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + !llvm.module.flags = !{!0, !1, !2, !3} !0 = !{i32 1, !"branch-target-enforcement", i32 1} @@ -10,7 +13,7 @@ !2 = !{i32 1, !"sign-return-address-all", i32 1} !3 = !{i32 1, !"sign-return-address-with-bkey", i32 1} -;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 1} -;CHECK: !1 = !{i32 8, !"sign-return-address", i32 1} -;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 1} -;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 1}
\ No newline at end of file +;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2} +;CHECK: !1 = !{i32 8, !"sign-return-address", i32 2} +;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2} +;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir index 97a0417..b040ff2 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir @@ -56,7 +56,7 @@ } - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { optsize } attributes #3 = { minsize } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir index fc4fbac..f24aeae 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir @@ -47,7 +47,7 @@ ret void } - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index b06cadf..e4d2ca3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -50,7 +50,7 @@ declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll index 0c1776e..6e3682a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll @@ -37,7 +37,7 @@ for.body: ; preds = %for.body, %entry ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll index f2ed57e..353e818 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll @@ -325,7 +325,7 @@ entry: declare void @hhh(double, double) -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll index 7e97116..8da0e11 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -694,8 +694,8 @@ bb1: ; CHECK: .[[LABEL]]: ; CHECK: ret -attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !1 = !{!2, !2, i64 0} !2 = !{!"int", !3, i64 0} diff --git a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll index 296435a..937bfe4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll @@ -519,8 +519,8 @@ while.cond: br label %while.cond } -attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir index 45fa2be5..c05d661 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir +++ b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir @@ -79,8 +79,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind readnone speculatable } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll index 4e86f52..071344d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll +++ b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll @@ -47,6 +47,6 @@ declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1 ; Function Attrs: nounwind readnone declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1 -attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll index 9b3d539..0ddcdcc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll @@ -8,5 +8,5 @@ define float @mul_add(float %a, float %b, float %c) local_unnamed_addr #0 { ret float %add } -attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll index e17a0a9..54f752e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -verify-machineinstrs | FileCheck %s define void @foo_2d(ptr %src) { ; CHECK-LABEL: %entry diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll index d2ce7e6..41f57bf 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll @@ -84,7 +84,7 @@ bb3: ; preds = %bb3, %bb ; Function Attrs: nounwind readnone declare i64 @llvm.objectsize.i64.p0(ptr, i1) #1 -attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !1 = !{!2, !2, i64 0} diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll index 0b22fa4..c2b2c1e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll @@ -1654,24 +1654,14 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) { } define <8 x i8> @dup_ld1_from_stack(ptr %__ret) { -; CHECK-SD-LABEL: dup_ld1_from_stack: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: add x8, sp, #15 -; CHECK-SD-NEXT: ld1r.8b { v0 }, [x8] -; CHECK-SD-NEXT: add sp, sp, #16 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: dup_ld1_from_stack: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w29, -16 -; CHECK-GI-NEXT: add x8, sp, #15 -; CHECK-GI-NEXT: ld1r.8b { v0 }, [x8] -; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: dup_ld1_from_stack: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #15 +; CHECK-NEXT: ld1r.8b { v0 }, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: %item = alloca i8, align 1 %0 = load i8, ptr %item, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll index 4cdc6cc..c6cf240 100644 --- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll +++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -107,7 +107,7 @@ define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) { ; Function Attrs: nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll index 82b34ef..bb1a6b0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll +++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll @@ -108,5 +108,5 @@ for.end: ; preds = %for.cond ; Function Attrs: nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll index d487aab..3ce35bf 100644 --- a/llvm/test/CodeGen/AArch64/arm64-rounding.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rounding.ll @@ -201,4 +201,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll index db65fdd..1486b3a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll +++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll @@ -36,6 +36,6 @@ for.end705.i: ; preds = %for.body453.i declare void @f() local_unnamed_addr #1 -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll index fc59350..593d629 100644 --- a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll @@ -18,7 +18,7 @@ entry: ret i32 %1 } -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll index 2e3b99f..c4bf7d2 100644 --- a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll +++ b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll @@ -61,4 +61,4 @@ declare dso_local void @e(...) local_unnamed_addr #0 declare dso_local i64 @llvm.aarch64.space(i32, i64) local_unnamed_addr #0 -attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll index 031ee35..7d2aaec 100644 --- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll +++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll @@ -108,7 +108,7 @@ bb19: ; preds = %bb3, %bb ret void } -attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/AArch64/csel-zero-float.ll b/llvm/test/CodeGen/AArch64/csel-zero-float.ll index 6edde13..56a33cc 100644 --- a/llvm/test/CodeGen/AArch64/csel-zero-float.ll +++ b/llvm/test/CodeGen/AArch64/csel-zero-float.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -enable-unsafe-fp-math < %s +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s ; There is no invocation to FileCheck as this ; caused a crash in "Post-RA pseudo instruction expansion" diff --git a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll index 61df396..e561481 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -32,5 +32,5 @@ main_: declare i32 @printf(ptr, ...) #1 -attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll index 1a83930..9193025 100644 --- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s ; load zero-extended i32, bitcast to f64 -define double @_Z9load_u64_from_u32_testPj(ptr %n){ -; CHECK-LABEL: _Z9load_u64_from_u32_testPj: +define double @load_u64_from_u32(ptr %n){ +; CHECK-LABEL: load_u64_from_u32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ret @@ -15,8 +15,8 @@ entry: } ; load zero-extended i16, bitcast to f64 -define double @_Z9load_u64_from_u16_testPj(ptr %n){ -; CHECK-LABEL: _Z9load_u64_from_u16_testPj: +define double @load_u64_from_u16(ptr %n){ +; CHECK-LABEL: load_u64_from_u16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ret @@ -28,8 +28,8 @@ entry: } ; load zero-extended i8, bitcast to f64 -define double @_Z16load_u64_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u64_from_u8Ph: +define double @load_u64_from_u8(ptr %n){ +; CHECK-LABEL: load_u64_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ret @@ -41,8 +41,8 @@ entry: } ; load zero-extended i16, bitcast to f32 -define float @_Z17load_u32_from_u16Pt(ptr %n){ -; CHECK-LABEL: _Z17load_u32_from_u16Pt: +define float @load_u32_from_u16(ptr %n){ +; CHECK-LABEL: load_u32_from_u16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ret @@ -54,8 +54,8 @@ entry: } ; load zero-extended i8, bitcast to f32 -define float @_Z16load_u32_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u32_from_u8Ph: +define float @load_u32_from_u8(ptr %n){ +; CHECK-LABEL: load_u32_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ret @@ -67,8 +67,8 @@ entry: } ; load zero-extended i8, bitcast to f16 -define half @_Z16load_u16_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u16_from_u8Ph: +define half @load_u16_from_u8(ptr %n){ +; CHECK-LABEL: load_u16_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 @@ -80,3 +80,504 @@ entry: ret half %1 } + +define double @load_u64_from_u32_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrh w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #64] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #16380] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16380 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #8190 // =0x1ffe +; CHECK-NEXT: ldr h0, [x0, x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #8190] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16384 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #4096] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1024] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr h0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + diff --git a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll index c2ef2fa..00a8c30 100644 --- a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll +++ b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll @@ -74,7 +74,7 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !10 } -attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll index 9f00621..fa1da33 100644 --- a/llvm/test/CodeGen/AArch64/recp-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll @@ -164,5 +164,5 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 { ; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } -attributes #0 = { nounwind "unsafe-fp-math"="true" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "reciprocal-estimates"="div,vec-div" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "reciprocal-estimates"="div,vec-div" } diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir new file mode 100644 index 0000000..6f33a75 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir @@ -0,0 +1,76 @@ +# RUN: llc -mtriple=aarch64 -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s +--- | + declare double @foo() + + define double @shrink_wrap_load_from_const_pool(double %q) { + entry: + %0 = fcmp oeq double %q, 3.125500e+02 + br i1 %0, label %common.ret, label %if.else + + common.ret: ; preds = %if.else, %entry, %exit1 + %common.ret.op = phi double [ %3, %exit1 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.else ] + ret double %common.ret.op + + if.else: ; preds = %entry + %1 = call double @foo() + %2 = fcmp oeq double %1, 0.000000e+00 + br i1 %2, label %exit1, label %common.ret + + exit1: ; preds = %if.else + %3 = call double @foo() + br label %common.ret + } +... +# Following code has a load from constant pool. Accessing constant pool +# must not be considered as a stack access and hence, shrink wrapping must +# happen. +# CHECK-LABEL:name: shrink_wrap_load_from_const_pool +# CHECK: savePoint: +# CHECK: - point: '%bb.3' +# CHECK: restorePoint: +# CHECK: - point: '%bb.5' +--- +name: shrink_wrap_load_from_const_pool +tracksRegLiveness: true +constants: + - id: 0 + value: 'double 3.125500e+02' + alignment: 8 +body: | + bb.0.entry: + successors: %bb.4(0x50000000), %bb.2(0x30000000) + liveins: $d0 + + renamable $d1 = COPY $d0 + renamable $x8 = ADRP target-flags(aarch64-page) %const.0 + renamable $d2 = LDRDui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) %const.0 :: (load (s64) from constant-pool) + renamable $d0 = FMOVD0 + nofpexcept FCMPDrr killed renamable $d1, killed renamable $d2, implicit-def $nzcv, implicit $fpcr + Bcc 1, %bb.2, implicit killed $nzcv + + bb.4: + liveins: $d0 + + bb.1.common.ret: + liveins: $d0 + + RET_ReallyLR implicit $d0 + + bb.2.if.else: + successors: %bb.3(0x50000000), %bb.1(0x30000000) + + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + renamable $d1 = COPY $d0 + renamable $d0 = FMOVD0 + nofpexcept FCMPDri killed renamable $d1, implicit-def $nzcv, implicit $fpcr + Bcc 1, %bb.1, implicit killed $nzcv + B %bb.3 + + bb.3.exit1: + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + B %bb.1 +... diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll index 66ac04e..22abb8c 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll @@ -64,6 +64,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll index e5725bc..d689a76 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll @@ -158,10 +158,10 @@ eh.resume: ; preds = %lpad.body resume { ptr, i32 } %eh.lpad-body } -attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind readnone } -attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #4 = { nounwind } attributes #5 = { noreturn } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll index 91adf82..7483622 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll @@ -77,6 +77,6 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll index 523eda61..e41d82c 100644 --- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll +++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll @@ -54,7 +54,7 @@ declare void @foo3(ptr) ; Function Attrs: nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) -attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } ;--- pic.ll !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll index f78fcea..b8dcd6f 100644 --- a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple aarch64-none-linux-gnu -enable-unsafe-fp-math -mattr=+fullfp16 < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fullfp16 < %s | FileCheck %s define half @scvtf_f16_2(i32 %state) { ; CHECK-LABEL: scvtf_f16_2: diff --git a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll index 623ea22..89b3b89 100644 --- a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll +++ b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll @@ -24,7 +24,7 @@ define void @fn(ptr %argA, ptr %argB, ptr %a) #0 align 2 { ; CHECK: ret -attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/AArch64/wineh-frame5.mir b/llvm/test/CodeGen/AArch64/wineh-frame5.mir index 97c5c85..32580f4 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame5.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame5.mir @@ -64,9 +64,9 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } - attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #3 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame6.mir b/llvm/test/CodeGen/AArch64/wineh-frame6.mir index 5ba7842..d76fae1 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame6.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame6.mir @@ -47,8 +47,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame7.mir b/llvm/test/CodeGen/AArch64/wineh-frame7.mir index 1599098..d4e71d9 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame7.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame7.mir @@ -71,8 +71,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame8.mir b/llvm/test/CodeGen/AArch64/wineh-frame8.mir index 9de99ac..56f92f2 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame8.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame8.mir @@ -29,7 +29,7 @@ ret i32 %add } - attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } ... --- diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir index efdd4b0..1c09b78 100644 --- a/llvm/test/CodeGen/AArch64/wineh5.mir +++ b/llvm/test/CodeGen/AArch64/wineh5.mir @@ -73,8 +73,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir index 2f631c2..52d0dff 100644 --- a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir +++ b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir @@ -56,9 +56,9 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } - attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #3 = { nounwind } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir index d9ac9a7..de1bb47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s # Test that we fold correct element from G_UNMERGE_VALUES into fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index 52b1beb..91f2f6f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 --- name: fract_f64_neg diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 5171403..7714c03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b01f13..7b81669 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll index b72eba8..8088c1b 100644 --- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll +++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll @@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B ; CHECK-LABEL: s_add64_32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s2, s4, 0 ; CHECK-NEXT: ; return to shader part epilog %sum64 = add i64 %val64A, %val64B @@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_addc_u32 s8, s3, s7 +; CHECK-NEXT: s_add_u32 s6, s2, s6 +; CHECK-NEXT: s_addc_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_sub_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_subb_u32 s8, s3, s7 +; CHECK-NEXT: s_sub_u32 s6, s2, s6 +; CHECK-NEXT: s_subb_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_sub_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) ; CHECK-LABEL: s_uadd_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_n1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, -1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 948811e..51df8c3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s14, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: s_add_u32 s16, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s4 +; GFX6-NEXT: s_addc_u32 s17, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s15 +; GFX6-NEXT: s_mul_i32 s4, s10, s17 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s14 -; GFX6-NEXT: s_add_i32 s16, s4, s5 -; GFX6-NEXT: s_sub_i32 s17, s7, s16 -; GFX6-NEXT: s_mul_i32 s4, s10, s14 +; GFX6-NEXT: s_mul_i32 s5, s11, s16 +; GFX6-NEXT: s_add_i32 s18, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s18 +; GFX6-NEXT: s_mul_i32 s4, s10, s16 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s18, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s17, s17, s11 -; GFX6-NEXT: s_sub_u32 s19, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s4, s5 +; GFX6-NEXT: s_subb_u32 s19, s14, s11 +; GFX6-NEXT: s_sub_u32 s20, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s20, s16, 2 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s20, s15 +; GFX6-NEXT: s_cselect_b32 s15, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s17, 0 +; GFX6-NEXT: s_subb_u32 s4, s7, s18 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s10 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s17, s5 -; GFX6-NEXT: s_add_u32 s5, s14, 1 -; GFX6-NEXT: s_addc_u32 s17, s15, 0 -; GFX6-NEXT: s_add_u32 s19, s14, 2 -; GFX6-NEXT: s_addc_u32 s20, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s4, s19, s5 -; GFX6-NEXT: s_cselect_b32 s5, s20, s17 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s7, s7, s16 -; GFX6-NEXT: s_cmp_ge_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s16, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s6, s6, s16 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s5, s5, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s14 +; GFX6-NEXT: s_cmp_eq_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s5, s15, s17 +; GFX6-NEXT: s_cselect_b32 s4, s14, s16 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s11, 0, s9 +; GFX9-NEXT: s_sub_u32 s4, 0, s8 +; GFX9-NEXT: s_subb_u32 s5, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s13, s11, s4 -; GFX9-NEXT: s_add_i32 s5, s14, s5 -; GFX9-NEXT: s_mul_i32 s15, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 -; GFX9-NEXT: s_mul_i32 s16, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 +; GFX9-NEXT: s_mul_i32 s13, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s5, s12, s5 -; GFX9-NEXT: s_add_u32 s5, s13, s5 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s4, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_mul_i32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 +; GFX9-NEXT: s_addc_u32 s4, s15, s13 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s10, s4 -; GFX9-NEXT: s_addc_u32 s10, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s11, s4 +; GFX9-NEXT: s_addc_u32 s10, s10, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s14, s11, s10 -; GFX9-NEXT: s_addc_u32 s15, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 +; GFX9-NEXT: s_add_u32 s13, s11, s10 +; GFX9-NEXT: s_addc_u32 s12, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s3, s16 -; GFX9-NEXT: s_mul_i32 s10, s8, s14 +; GFX9-NEXT: s_mul_i32 s11, s9, s13 +; GFX9-NEXT: s_add_i32 s14, s10, s11 +; GFX9-NEXT: s_sub_i32 s15, s3, s14 +; GFX9-NEXT: s_mul_i32 s10, s8, s13 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s17, s12, s9 -; GFX9-NEXT: s_sub_u32 s18, s2, s8 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, s9 +; GFX9-NEXT: s_sub_u32 s16, s2, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s9 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s12, s17, s13 -; GFX9-NEXT: s_add_u32 s13, s14, 1 -; GFX9-NEXT: s_addc_u32 s17, s15, 0 -; GFX9-NEXT: s_add_u32 s18, s14, 2 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s18, s13 -; GFX9-NEXT: s_cselect_b32 s13, s19, s17 +; GFX9-NEXT: s_cmp_ge_u32 s16, s8 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s15, s9 +; GFX9-NEXT: s_cselect_b32 s15, s16, s17 +; GFX9-NEXT: s_add_u32 s16, s13, 1 +; GFX9-NEXT: s_addc_u32 s17, s12, 0 +; GFX9-NEXT: s_add_u32 s18, s13, 2 +; GFX9-NEXT: s_addc_u32 s19, s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cselect_b32 s16, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s16 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s13, s15 -; GFX9-NEXT: s_cselect_b32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b32 s3, s16, s12 +; GFX9-NEXT: s_cselect_b32 s2, s15, s13 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s17, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: s_add_u32 s18, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s16 +; GFX6-NEXT: s_addc_u32 s19, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s19 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s17 -; GFX6-NEXT: s_add_i32 s18, s14, s15 -; GFX6-NEXT: s_sub_i32 s19, s9, s18 -; GFX6-NEXT: s_mul_i32 s14, s6, s17 +; GFX6-NEXT: s_mul_i32 s15, s7, s18 +; GFX6-NEXT: s_add_i32 s20, s14, s15 +; GFX6-NEXT: s_sub_i32 s16, s9, s20 +; GFX6-NEXT: s_mul_i32 s14, s6, s18 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s7 -; GFX6-NEXT: s_sub_u32 s21, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s14, s15 +; GFX6-NEXT: s_subb_u32 s21, s16, s7 +; GFX6-NEXT: s_sub_u32 s22, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s16, s17 +; GFX6-NEXT: s_subb_u32 s16, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s16, s21, s17 +; GFX6-NEXT: s_add_u32 s17, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s16, s22, s17 +; GFX6-NEXT: s_cselect_b32 s17, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s17, 1 -; GFX6-NEXT: s_addc_u32 s19, s16, 0 -; GFX6-NEXT: s_add_u32 s21, s17, 2 -; GFX6-NEXT: s_addc_u32 s22, s16, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_cselect_b32 s15, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s18 +; GFX6-NEXT: s_subb_u32 s9, s9, s20 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s18 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s15, s16 -; GFX6-NEXT: s_cselect_b32 s6, s14, s17 +; GFX6-NEXT: s_cselect_b32 s7, s17, s19 +; GFX6-NEXT: s_cselect_b32 s6, s16, s18 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s14, s6, s2 -; GFX6-NEXT: s_subb_u32 s15, s7, s3 +; GFX6-NEXT: s_sub_u32 s16, s6, s2 +; GFX6-NEXT: s_subb_u32 s17, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: s_mul_i32 s1, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s17, s12, s2 +; GFX6-NEXT: s_mul_i32 s15, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s17, s16, s17 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s17 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s16, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s16, s4 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s16, s12 +; GFX6-NEXT: s_addc_u32 s3, s14, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s17, s2 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_add_u32 s2, s15, s2 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: s_addc_u32 s2, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s16, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s18, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s17 +; GFX6-NEXT: s_addc_u32 s19, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s19 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s16 -; GFX6-NEXT: s_add_i32 s18, s12, s13 -; GFX6-NEXT: s_sub_i32 s19, s11, s18 -; GFX6-NEXT: s_mul_i32 s12, s8, s16 +; GFX6-NEXT: s_mul_i32 s13, s9, s18 +; GFX6-NEXT: s_add_i32 s20, s12, s13 +; GFX6-NEXT: s_sub_i32 s14, s11, s20 +; GFX6-NEXT: s_mul_i32 s12, s8, s18 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s9 -; GFX6-NEXT: s_sub_u32 s21, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s21, s14, s9 +; GFX6-NEXT: s_sub_u32 s22, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_add_u32 s15, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s22, s15 +; GFX6-NEXT: s_cselect_b32 s15, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s12, s19, s13 -; GFX6-NEXT: s_add_u32 s13, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s21, s16, 2 -; GFX6-NEXT: s_addc_u32 s22, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s12, s21, s13 -; GFX6-NEXT: s_cselect_b32 s13, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s11, s11, s18 +; GFX6-NEXT: s_subb_u32 s11, s11, s20 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s18 +; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s13, s17 -; GFX6-NEXT: s_cselect_b32 s8, s12, s16 +; GFX6-NEXT: s_cselect_b32 s9, s15, s19 +; GFX6-NEXT: s_cselect_b32 s8, s14, s18 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s14, 0, s6 -; GFX9-NEXT: s_subb_u32 s15, 0, s7 +; GFX9-NEXT: s_sub_u32 s12, 0, s6 +; GFX9-NEXT: s_subb_u32 s13, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s13, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s17, s15, s12 -; GFX9-NEXT: s_add_i32 s13, s18, s13 -; GFX9-NEXT: s_mul_i32 s19, s14, s12 -; GFX9-NEXT: s_add_i32 s13, s13, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 -; GFX9-NEXT: s_mul_i32 s20, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v0 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 +; GFX9-NEXT: s_mul_i32 s17, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s18, s16 +; GFX9-NEXT: s_mul_i32 s19, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 -; GFX9-NEXT: s_mul_i32 s19, s16, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 +; GFX9-NEXT: s_mul_i32 s19, s14, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s13, s16, s13 -; GFX9-NEXT: s_add_u32 s13, s17, s13 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s16, s17, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s18, s12, s13 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_i32 s12, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s15, s15, s18 -; GFX9-NEXT: s_add_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s14, s18 -; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s16, s14 -; GFX9-NEXT: s_mul_i32 s20, s18, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 -; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 -; GFX9-NEXT: s_add_u32 s14, s14, s20 +; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_addc_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 -; GFX9-NEXT: s_addc_u32 s14, s19, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 +; GFX9-NEXT: s_addc_u32 s12, s19, s17 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s12, s16, s12 -; GFX9-NEXT: s_add_u32 s12, s14, s12 -; GFX9-NEXT: s_addc_u32 s14, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s18, s12 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s16 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s15, s12 +; GFX9-NEXT: s_addc_u32 s14, s14, s13 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s19, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s19 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 +; GFX9-NEXT: s_add_u32 s17, s15, s14 +; GFX9-NEXT: s_addc_u32 s16, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s18 -; GFX9-NEXT: s_add_i32 s20, s14, s15 -; GFX9-NEXT: s_sub_i32 s16, s9, s20 -; GFX9-NEXT: s_mul_i32 s14, s6, s18 +; GFX9-NEXT: s_mul_i32 s15, s7, s17 +; GFX9-NEXT: s_add_i32 s18, s14, s15 +; GFX9-NEXT: s_sub_i32 s19, s9, s18 +; GFX9-NEXT: s_mul_i32 s14, s6, s17 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s21, s16, s7 -; GFX9-NEXT: s_sub_u32 s22, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GFX9-NEXT: s_subb_u32 s16, s21, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s22, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s19, s7 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s16, s21, s17 -; GFX9-NEXT: s_add_u32 s17, s18, 1 -; GFX9-NEXT: s_addc_u32 s21, s19, 0 -; GFX9-NEXT: s_add_u32 s22, s18, 2 -; GFX9-NEXT: s_addc_u32 s23, s19, 0 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s16, s22, s17 -; GFX9-NEXT: s_cselect_b32 s17, s23, s21 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_cselect_b32 s20, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s19, s7 +; GFX9-NEXT: s_cselect_b32 s19, s20, s21 +; GFX9-NEXT: s_add_u32 s20, s17, 1 +; GFX9-NEXT: s_addc_u32 s21, s16, 0 +; GFX9-NEXT: s_add_u32 s22, s17, 2 +; GFX9-NEXT: s_addc_u32 s23, s16, 0 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b32 s19, s22, s20 +; GFX9-NEXT: s_cselect_b32 s20, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s20 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s17, s19 -; GFX9-NEXT: s_cselect_b32 s6, s16, s18 +; GFX9-NEXT: s_cselect_b32 s7, s20, s16 +; GFX9-NEXT: s_cselect_b32 s6, s19, s17 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s14, s6, s2 -; GFX9-NEXT: s_subb_u32 s15, s7, s3 +; GFX9-NEXT: s_sub_u32 s12, s6, s2 +; GFX9-NEXT: s_subb_u32 s13, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s8, s13 -; GFX9-NEXT: s_mul_i32 s5, s9, s4 -; GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s17, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s4, s15 +; GFX9-NEXT: s_mul_i32 s9, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s16 +; GFX9-NEXT: s_add_i32 s14, s14, s9 +; GFX9-NEXT: s_mul_i32 s17, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 -; GFX9-NEXT: s_mul_i32 s17, s13, s17 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 +; GFX9-NEXT: s_mul_i32 s17, s15, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s9, s9, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s4, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s16 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s16 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_mul_i32 s13, s12, s8 -; GFX9-NEXT: s_mul_i32 s18, s16, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s18 +; GFX9-NEXT: s_mul_i32 s14, s15, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s14 +; GFX9-NEXT: s_addc_u32 s14, 0, s16 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_addc_u32 s9, s15, s14 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 +; GFX9-NEXT: s_mul_i32 s16, s9, s4 +; GFX9-NEXT: s_mul_i32 s18, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s8, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s8, s17, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 +; GFX9-NEXT: s_addc_u32 s4, s17, s15 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s13, s16, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s8 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s14, s8, s4 +; GFX9-NEXT: s_addc_u32 s15, s9, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX9-NEXT: s_mul_i32 s11, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 -; GFX9-NEXT: s_mul_i32 s13, s9, s13 -; GFX9-NEXT: s_add_u32 s11, s11, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s9, s12 -; GFX9-NEXT: s_add_u32 s16, s10, s12 -; GFX9-NEXT: s_addc_u32 s17, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s17 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 +; GFX9-NEXT: s_mul_i32 s14, s9, s15 +; GFX9-NEXT: s_add_u32 s14, s10, s14 +; GFX9-NEXT: s_addc_u32 s15, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s16 -; GFX9-NEXT: s_add_i32 s18, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s9, s18 -; GFX9-NEXT: s_mul_i32 s10, s6, s16 +; GFX9-NEXT: s_mul_i32 s11, s7, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s17, s9, s16 +; GFX9-NEXT: s_mul_i32 s10, s6, s14 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s19, s12, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, s7 +; GFX9-NEXT: s_sub_u32 s18, s8, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s7 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s12, s19, s13 -; GFX9-NEXT: s_add_u32 s13, s16, 1 -; GFX9-NEXT: s_addc_u32 s19, s17, 0 -; GFX9-NEXT: s_add_u32 s20, s16, 2 -; GFX9-NEXT: s_addc_u32 s21, s17, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s20, s13 -; GFX9-NEXT: s_cselect_b32 s13, s21, s19 +; GFX9-NEXT: s_cmp_ge_u32 s18, s6 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, s7 +; GFX9-NEXT: s_cselect_b32 s17, s18, s19 +; GFX9-NEXT: s_add_u32 s18, s14, 1 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_add_u32 s20, s14, 2 +; GFX9-NEXT: s_addc_u32 s21, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, s20, s18 +; GFX9-NEXT: s_cselect_b32 s18, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s16 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s13, s17 -; GFX9-NEXT: s_cselect_b32 s6, s12, s16 +; GFX9-NEXT: s_cselect_b32 s7, s18, s15 +; GFX9-NEXT: s_cselect_b32 s6, s17, s14 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s13, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s13 +; GFX6-NEXT: s_add_i32 s14, s4, s5 +; GFX6-NEXT: s_sub_i32 s13, s7, s14 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s15, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_subb_u32 s15, s13, s9 +; GFX6-NEXT: s_sub_u32 s16, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, s19, s18 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s15, s15, s9 +; GFX6-NEXT: s_sub_u32 s19, s16, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s16, s14, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s9 +; GFX6-NEXT: s_subb_u32 s4, s7, s14 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s15, s8 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s9 -; GFX6-NEXT: s_cselect_b32 s17, s17, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s18, s15, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s14, 0 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b32 s14, s18, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s16 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s5, s7, s13 -; GFX6-NEXT: s_cmp_ge_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s8 -; GFX6-NEXT: s_cselect_b32 s8, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, s8, s7 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 -; GFX6-NEXT: s_cselect_b32 s5, s4, s5 -; GFX6-NEXT: s_cselect_b32 s4, s14, s6 +; GFX6-NEXT: s_cselect_b32 s7, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, s12, s4 +; GFX6-NEXT: s_cselect_b32 s4, s13, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s11, s9, s4 -; GFX9-NEXT: s_add_i32 s5, s12, s5 -; GFX9-NEXT: s_mul_i32 s13, s8, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX9-NEXT: s_mul_i32 s14, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_mul_i32 s11, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 -; GFX9-NEXT: s_mul_i32 s13, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX9-NEXT: s_mul_i32 s13, s8, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s5, s10, s5 -; GFX9-NEXT: s_add_u32 s5, s11, s5 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s12, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s4, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s12 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 -; GFX9-NEXT: s_mul_i32 s11, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s12, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX9-NEXT: s_mul_i32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 +; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 -; GFX9-NEXT: s_addc_u32 s8, s13, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 +; GFX9-NEXT: s_addc_u32 s4, s13, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s10, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s12, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s4 +; GFX9-NEXT: s_addc_u32 s8, s8, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s17, s14, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_subb_u32 s10, s13, s7 +; GFX9-NEXT: s_sub_u32 s11, s14, s6 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_add_i32 s16, s13, s14 +; GFX6-NEXT: s_sub_i32 s14, s9, s16 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s17, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s2 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s19, s19, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s20, s17, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s14, s3 +; GFX6-NEXT: s_sub_u32 s18, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s14, s15 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s2 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s17, s17, s3 +; GFX6-NEXT: s_sub_u32 s21, s18, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s13, s20, s17 -; GFX6-NEXT: s_cselect_b32 s12, s12, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s12, s9 -; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_cselect_b32 s3, s14, s9 +; GFX6-NEXT: s_cselect_b32 s2, s15, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s12, s2, s6 -; GFX6-NEXT: s_subb_u32 s13, s3, s6 +; GFX6-NEXT: s_sub_u32 s14, s2, s6 +; GFX6-NEXT: s_subb_u32 s15, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: s_mul_i32 s1, s8, s12 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: s_mul_i32 s13, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s13 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s12, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_addc_u32 s4, s12, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s9, s15, s9 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 +; GFX6-NEXT: s_add_u32 s9, s13, s9 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: s_addc_u32 s3, s12, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_add_u32 s12, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_addc_u32 s13, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: s_mul_i32 s2, s8, s13 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: s_addc_u32 s2, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_mul_i32 s11, s9, s13 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_add_i32 s10, s14, s10 -; GFX6-NEXT: s_mul_i32 s14, s7, s11 -; GFX6-NEXT: s_add_i32 s14, s10, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s10, s12, s10 +; GFX6-NEXT: s_mul_i32 s12, s7, s11 +; GFX6-NEXT: s_add_i32 s16, s10, s12 +; GFX6-NEXT: s_sub_i32 s12, s9, s16 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s17, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s11, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s19, s19, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s20, s17, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s13, s10, s11 +; GFX6-NEXT: s_subb_u32 s17, s12, s7 +; GFX6-NEXT: s_sub_u32 s18, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s12, s13 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s17, s7 +; GFX6-NEXT: s_sub_u32 s21, s18, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s11, s20, s17 -; GFX6-NEXT: s_cselect_b32 s10, s10, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s10, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s10 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s10, s9 -; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_cselect_b32 s7, s12, s9 +; GFX6-NEXT: s_cselect_b32 s6, s13, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s12, 0, s2 -; GFX9-NEXT: s_subb_u32 s13, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s15, s13, s6 -; GFX9-NEXT: s_add_i32 s7, s16, s7 -; GFX9-NEXT: s_mul_i32 s17, s12, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 -; GFX9-NEXT: s_mul_i32 s18, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 +; GFX9-NEXT: s_mul_i32 s15, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 -; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 +; GFX9-NEXT: s_mul_i32 s17, s12, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s7, s14, s7 -; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s14, s15, s14 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s6, s7 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s6, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 -; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s13, s13, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s16, s6 -; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 -; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_add_u32 s13, s13, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 +; GFX9-NEXT: s_mul_i32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 +; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 -; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 +; GFX9-NEXT: s_addc_u32 s6, s17, s15 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s6, s14, s6 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s12, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s16, s6 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s13, s6 +; GFX9-NEXT: s_addc_u32 s12, s12, s7 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s3 -; GFX9-NEXT: s_sub_u32 s21, s18, s2 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_subb_u32 s14, s17, s3 +; GFX9-NEXT: s_sub_u32 s15, s18, s2 +; GFX9-NEXT: s_subb_u32 s14, s14, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s15, s15, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s4, 0, s2 +; GFX9-NEXT: s_subb_u32 s5, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s6, s9 -; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s15, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s15, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s7, s7, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s4, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s7, s7, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX9-NEXT: s_mul_i32 s9, s8, s6 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_addc_u32 s7, s9, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 +; GFX9-NEXT: s_mul_i32 s14, s7, s4 +; GFX9-NEXT: s_mul_i32 s16, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 -; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 +; GFX9-NEXT: s_addc_u32 s4, s15, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s8, s4 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s8, s6, s4 +; GFX9-NEXT: s_addc_u32 s9, s7, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_mul_i32 s11, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s9, s11, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 ; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s3 -; GFX9-NEXT: s_sub_u32 s19, s16, s2 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_subb_u32 s10, s15, s3 +; GFX9-NEXT: s_sub_u32 s11, s16, s2 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c..01f4414 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 258bc295..9db6d70 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f..6167a84 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28..9afc0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9a..10fd34f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 4a6fa4f..b96de17 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s12, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_add_u32 s10, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s13, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s13 +; VI-NEXT: s_addc_u32 s11, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s11 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s12 -; VI-NEXT: s_add_i32 s14, s8, s9 -; VI-NEXT: s_sub_i32 s10, s3, s14 +; VI-NEXT: s_mul_i32 s9, s5, s10 +; VI-NEXT: s_add_i32 s12, s8, s9 +; VI-NEXT: s_sub_i32 s13, s3, s12 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s15, s2, s8 +; VI-NEXT: s_sub_u32 s14, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s16, s10, s5 -; VI-NEXT: s_sub_u32 s17, s15, s4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 -; VI-NEXT: s_subb_u32 s10, s16, 0 -; VI-NEXT: s_cmp_ge_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s11, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s17, s4 +; VI-NEXT: s_subb_u32 s13, s13, s5 +; VI-NEXT: s_sub_u32 s15, s14, s4 +; VI-NEXT: s_subb_u32 s13, s13, 0 +; VI-NEXT: s_cmp_ge_u32 s13, s5 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s10, s16, s11 -; VI-NEXT: s_add_u32 s11, s12, 1 -; VI-NEXT: s_addc_u32 s16, s13, 0 -; VI-NEXT: s_add_u32 s17, s12, 2 -; VI-NEXT: s_addc_u32 s18, s13, 0 -; VI-NEXT: s_cmp_lg_u32 s10, 0 -; VI-NEXT: s_cselect_b32 s10, s17, s11 -; VI-NEXT: s_cselect_b32 s11, s18, s16 +; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cselect_b32 s15, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s13, s5 +; VI-NEXT: s_cselect_b32 s13, s15, s16 +; VI-NEXT: s_add_u32 s15, s10, 1 +; VI-NEXT: s_addc_u32 s16, s11, 0 +; VI-NEXT: s_add_u32 s17, s10, 2 +; VI-NEXT: s_addc_u32 s18, s11, 0 +; VI-NEXT: s_cmp_lg_u32 s13, 0 +; VI-NEXT: s_cselect_b32 s13, s17, s15 +; VI-NEXT: s_cselect_b32 s15, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s14 +; VI-NEXT: s_subb_u32 s3, s3, s12 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cmp_ge_u32 s14, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s11, s13 -; VI-NEXT: s_cselect_b32 s8, s10, s12 +; VI-NEXT: s_cselect_b32 s9, s15, s11 +; VI-NEXT: s_cselect_b32 s8, s13, s10 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s10, 0, s6 -; GFX9-NEXT: s_subb_u32 s11, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s13, s11, s8 -; GFX9-NEXT: s_add_i32 s9, s14, s9 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_mul_i32 s15, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX9-NEXT: s_mul_i32 s13, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s8, s11 +; GFX9-NEXT: s_mul_i32 s14, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s9, s12, s9 -; GFX9-NEXT: s_add_u32 s9, s13, s9 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s8, s9 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s8, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 +; GFX9-NEXT: s_addc_u32 s8, s15, s13 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s8, s12, s8 -; GFX9-NEXT: s_add_u32 s8, s10, s8 -; GFX9-NEXT: s_addc_u32 s10, 0, s9 -; GFX9-NEXT: s_add_u32 s11, s14, s8 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s8, s12, s10 -; GFX9-NEXT: s_mul_i32 s10, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 -; GFX9-NEXT: s_mul_i32 s11, s3, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 -; GFX9-NEXT: s_addc_u32 s9, s9, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s11, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_add_u32 s11, s12, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 ; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s12, s9, s8 -; GFX9-NEXT: s_addc_u32 s13, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s9, s3, s9 +; GFX9-NEXT: s_add_u32 s11, s8, s9 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s8, s9 -; GFX9-NEXT: s_sub_i32 s10, s3, s14 -; GFX9-NEXT: s_mul_i32 s8, s6, s12 -; GFX9-NEXT: s_sub_u32 s15, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s11 +; GFX9-NEXT: s_add_i32 s12, s8, s9 +; GFX9-NEXT: s_sub_i32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s8, s6, s11 +; GFX9-NEXT: s_sub_u32 s14, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s16, s10, s7 -; GFX9-NEXT: s_sub_u32 s17, s15, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s16, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s11, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s15, s14, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, 0 +; GFX9-NEXT: s_cmp_ge_u32 s13, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s10, s16, s11 -; GFX9-NEXT: s_add_u32 s11, s12, 1 -; GFX9-NEXT: s_addc_u32 s16, s13, 0 -; GFX9-NEXT: s_add_u32 s17, s12, 2 -; GFX9-NEXT: s_addc_u32 s18, s13, 0 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s10, s17, s11 -; GFX9-NEXT: s_cselect_b32 s11, s18, s16 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, s7 +; GFX9-NEXT: s_cselect_b32 s13, s15, s16 +; GFX9-NEXT: s_add_u32 s15, s11, 1 +; GFX9-NEXT: s_addc_u32 s16, s10, 0 +; GFX9-NEXT: s_add_u32 s17, s11, 2 +; GFX9-NEXT: s_addc_u32 s18, s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b32 s13, s17, s15 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s12 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cmp_ge_u32 s14, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s11, s13 -; GFX9-NEXT: s_cselect_b32 s8, s10, s12 +; GFX9-NEXT: s_cselect_b32 s9, s15, s10 +; GFX9-NEXT: s_cselect_b32 s8, s13, s11 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 -; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s9, s8 ; GFX1010-NEXT: s_mul_i32 s9, s9, s5 -; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1010-NEXT: s_add_i32 s9, s13, s9 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_add_i32 s9, s11, s9 +; GFX1010-NEXT: s_mul_i32 s11, s5, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_i32 s10, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s12, s12, s15 +; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s12, s10 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s10, s11 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 -; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1010-NEXT: s_add_u32 s11, s11, s12 -; GFX1010-NEXT: s_addc_u32 s10, 0, s10 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_add_u32 s9, s9, s12 +; GFX1010-NEXT: s_addc_u32 s11, 0, s11 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s11, s8 +; GFX1010-NEXT: s_add_u32 s8, s9, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s10, s9 +; GFX1010-NEXT: s_addc_u32 s8, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 -; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 -; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 -; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 +; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 -; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 +; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 +; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 +; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 -; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 -; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 +; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 +; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 -; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 +; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 -; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 +; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 -; GFX11-NEXT: s_cselect_b32 s11, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s9, s8 ; GFX11-NEXT: s_mul_i32 s9, s9, s7 -; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX11-NEXT: s_add_i32 s9, s13, s9 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_add_i32 s9, s11, s9 +; GFX11-NEXT: s_mul_i32 s11, s7, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_i32 s10, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s12, s12, s15 +; GFX11-NEXT: s_add_u32 s10, s10, s15 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s12, s10 +; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s10, s11 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX11-NEXT: s_add_u32 s11, s11, s12 -; GFX11-NEXT: s_addc_u32 s10, 0, s10 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_add_u32 s9, s9, s12 +; GFX11-NEXT: s_addc_u32 s11, 0, s11 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s11, s8 +; GFX11-NEXT: s_add_u32 s8, s9, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s10, s9 +; GFX11-NEXT: s_addc_u32 s8, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 -; GFX11-NEXT: s_cselect_b32 s14, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 -; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 -; GFX1250-NEXT: s_mul_i32 s11, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 +; GFX1250-NEXT: s_mul_i32 s12, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 -; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4b151b9..07e6a76 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index cefcbdd..fca57be 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index d8a5e7fa..dbdea8e 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 -; GFX10-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo @@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, s2 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s2, 0 +; GFX9-NEXT: s_add_u32 s1, s0, s0 +; GFX9-NEXT: s_addc_u32 s0, s0, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index 13206ad..f45070c 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s -; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s ; FIXME: This should also fold when fma is actually fast if an FMA ; exists in the original program. diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 62847b1..9a17538 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_and_b32 s5, s5, 0xffe +; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_or_b32 s2, s6, s2 -; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_and_b32 s5, s5, 0xffe +; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_or_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s5, s2 -; GFX11-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-NEXT: s_or_b32 s2, s6, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index d41e2c6..8df7564 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index a43292d..a043d53 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptosi_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 96cb621..af1ab37 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptoui_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index b0dd187..c28b25c7 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe -; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 -; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 5d31177..b6b26a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -2,14 +2,14 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s @@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll index f09c1c6..cc2e78d 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll @@ -2,8 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll index 8ef0fcf..723fd93 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.ll @@ -1,8 +1,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.floor.f32(float) #0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d1..31f277f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb3..4581efc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac008..bd570d9 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143..1f2d70c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 4ae0ba0..4e93eca 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=GCN,UNSAFE %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 %s | FileCheck --check-prefixes=GCN,UNSAFE %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eee232a..c3f3917 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: s_and_b32 s13, s8, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s13, s13, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_cselect_b32 s1, s19, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 8748aff..6dc9199 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index c1cf06e..fba42c4 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,9 +388,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit killed $scc - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc + ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -417,6 +416,80 @@ body: | S_ENDPGM 0 ... +--- +name: xor_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +name: absdiff_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... --- name: and_1_cmp_eq_1_clobbered_scc @@ -2070,8 +2143,7 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll index ef3e04c..6ce614b 100644 --- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=fast -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=FP-CONTRACT-FAST %s -; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off --enable-unsafe-fp-math -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s +; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=NO-UNSAFE-FP-MATH %s define double @is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index f53aaaa..dd5f838 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s declare i32 @llvm.ctpop.i32(i32) declare i64 @llvm.ctpop.i64(i64) @@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { %zext = zext i1 %cmp to i32 ret i32 %zext } + + +; -------------------------------------------------------------------------------- +; Negative tests +; -------------------------------------------------------------------------------- + +@1 = extern_weak dso_local addrspace(4) constant i32 + +define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() { +; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB35_2 +; CHECK-NEXT: ; %bb.1: ; %endif +; CHECK-NEXT: s_mov_b32 s0, 1 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_2: ; %if +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_3: + %cmp = icmp ne ptr addrspace(4) @1, null + br i1 %cmp, label %endif, label %if + +if: + ret i32 0 + +endif: + ret i32 1 +} diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index a828ee0..7552f6b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 5f6d622..71f5a94 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s14, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_add_u32 s16, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s15, 0, s4 +; GCN-NEXT: s_addc_u32 s17, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s15 +; GCN-NEXT: s_mul_i32 s4, s10, s17 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s14 -; GCN-NEXT: s_add_i32 s16, s4, s5 -; GCN-NEXT: s_sub_i32 s17, s7, s16 -; GCN-NEXT: s_mul_i32 s4, s10, s14 +; GCN-NEXT: s_mul_i32 s5, s11, s16 +; GCN-NEXT: s_add_i32 s18, s4, s5 +; GCN-NEXT: s_sub_i32 s14, s7, s18 +; GCN-NEXT: s_mul_i32 s4, s10, s16 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s18, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s17, s17, s11 -; GCN-NEXT: s_sub_u32 s19, s6, s10 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s15, s4, s5 +; GCN-NEXT: s_subb_u32 s19, s14, s11 +; GCN-NEXT: s_sub_u32 s20, s6, s10 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_or_b32 s14, s14, s15 +; GCN-NEXT: s_subb_u32 s14, s19, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s14, s19, s15 +; GCN-NEXT: s_add_u32 s15, s16, 1 +; GCN-NEXT: s_addc_u32 s19, s17, 0 +; GCN-NEXT: s_add_u32 s20, s16, 2 +; GCN-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s14, s20, s15 +; GCN-NEXT: s_cselect_b32 s15, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s17, 0 +; GCN-NEXT: s_subb_u32 s4, s7, s18 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s19, s10 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s17, s5 -; GCN-NEXT: s_add_u32 s5, s14, 1 -; GCN-NEXT: s_addc_u32 s17, s15, 0 -; GCN-NEXT: s_add_u32 s19, s14, 2 -; GCN-NEXT: s_addc_u32 s20, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s19, s5 -; GCN-NEXT: s_cselect_b32 s5, s20, s17 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s16 -; GCN-NEXT: s_cmp_ge_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s6, s6, s16 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s15 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_eq_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s5, s15, s17 +; GCN-NEXT: s_cselect_b32 s4, s14, s16 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s11, s11, s12 ; GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mul_i32 s8, s7, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s11, s9, s8 -; GCN-NEXT: s_sub_i32 s12, 0, s11 -; GCN-NEXT: s_mul_i32 s8, s6, s10 -; GCN-NEXT: s_sub_u32 s13, 24, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s12, s12, s7 -; GCN-NEXT: s_sub_u32 s15, s13, s6 +; GCN-NEXT: s_add_i32 s13, s9, s8 +; GCN-NEXT: s_sub_i32 s10, 0, s13 +; GCN-NEXT: s_mul_i32 s8, s6, s12 +; GCN-NEXT: s_sub_u32 s14, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s11, s8, s9 +; GCN-NEXT: s_subb_u32 s15, s10, s7 +; GCN-NEXT: s_sub_u32 s16, s14, s6 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s10, s15, s11 +; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_addc_u32 s17, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s16, s11 +; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s13 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s6 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s8, s12, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s12, 0, 0 -; GCN-NEXT: s_add_u32 s15, s10, 2 -; GCN-NEXT: s_addc_u32 s16, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s15, s9 -; GCN-NEXT: s_cselect_b32 s9, s16, s12 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s11, 0, s11 -; GCN-NEXT: s_cmp_ge_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s12 +; GCN-NEXT: s_cmp_eq_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, s10 +; GCN-NEXT: s_cselect_b32 s7, s11, 0 +; GCN-NEXT: s_cselect_b32 s6, s10, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 09596e9..7ddd90e 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index bbd1793..e12e31b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s12, 0, s9 +; GCN-NEXT: s_subb_u32 s10, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s11, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 -; GCN-NEXT: s_mul_i32 s14, s12, s10 -; GCN-NEXT: s_add_i32 s11, s15, s11 -; GCN-NEXT: s_add_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s10 -; GCN-NEXT: s_mul_i32 s15, s10, s11 -; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 +; GCN-NEXT: s_mul_i32 s14, s10, s12 +; GCN-NEXT: s_add_i32 s13, s15, s13 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s12 +; GCN-NEXT: s_mul_i32 s15, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 -; GCN-NEXT: s_mul_i32 s16, s13, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 +; GCN-NEXT: s_mul_i32 s16, s11, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s11, s13, s11 -; GCN-NEXT: s_add_u32 s11, s14, s11 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s15, s10, s11 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s10, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 -; GCN-NEXT: s_add_i32 s10, s11, s10 -; GCN-NEXT: s_mul_i32 s12, s12, s15 -; GCN-NEXT: s_add_i32 s10, s10, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 -; GCN-NEXT: s_mul_i32 s14, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s15, s10 -; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 +; GCN-NEXT: s_add_i32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s10, s10, s12 +; GCN-NEXT: s_add_i32 s13, s13, s10 +; GCN-NEXT: s_mul_i32 s3, s3, s12 +; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 +; GCN-NEXT: s_mul_i32 s15, s11, s3 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 -; GCN-NEXT: s_addc_u32 s3, s16, s12 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s10, s13, s10 -; GCN-NEXT: s_add_u32 s3, s3, s10 -; GCN-NEXT: s_addc_u32 s12, 0, s11 -; GCN-NEXT: s_add_u32 s3, s15, s3 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s14, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 +; GCN-NEXT: s_addc_u32 s3, s16, s14 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s3, s3, s13 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s3, s12, s3 +; GCN-NEXT: s_addc_u32 s14, s11, s10 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s21, s18, s8 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_sub_u32 s16, s18, s8 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s21, s18 +; GCN-NEXT: s_cselect_b32 s16, s16, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s16, 0, s7 +; GCN-NEXT: s_subb_u32 s14, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 -; GCN-NEXT: s_mul_i32 s18, s16, s14 -; GCN-NEXT: s_add_i32 s15, s19, s15 -; GCN-NEXT: s_add_i32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s14 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 +; GCN-NEXT: s_mul_i32 s18, s14, s16 +; GCN-NEXT: s_add_i32 s17, s19, s17 +; GCN-NEXT: s_add_i32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s16 +; GCN-NEXT: s_mul_i32 s19, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 -; GCN-NEXT: s_mul_i32 s20, s17, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 +; GCN-NEXT: s_mul_i32 s20, s15, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s15, s17, s15 -; GCN-NEXT: s_add_u32 s15, s18, s15 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s17, s18, s17 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s19, s14, s15 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s14, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 -; GCN-NEXT: s_add_i32 s14, s15, s14 -; GCN-NEXT: s_mul_i32 s16, s16, s19 -; GCN-NEXT: s_add_i32 s14, s14, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 -; GCN-NEXT: s_mul_i32 s18, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s19, s14 -; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 +; GCN-NEXT: s_add_u32 s16, s16, s17 +; GCN-NEXT: s_addc_u32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 +; GCN-NEXT: s_add_i32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s14, s14, s16 +; GCN-NEXT: s_add_i32 s17, s17, s14 +; GCN-NEXT: s_mul_i32 s9, s9, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 +; GCN-NEXT: s_mul_i32 s19, s15, s9 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 -; GCN-NEXT: s_addc_u32 s9, s20, s16 -; GCN-NEXT: s_addc_u32 s15, s15, 0 -; GCN-NEXT: s_mul_i32 s14, s17, s14 -; GCN-NEXT: s_add_u32 s9, s9, s14 -; GCN-NEXT: s_addc_u32 s16, 0, s15 -; GCN-NEXT: s_add_u32 s9, s19, s9 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s18, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 +; GCN-NEXT: s_addc_u32 s9, s20, s18 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s9, s9, s17 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_add_u32 s9, s16, s9 +; GCN-NEXT: s_addc_u32 s18, s15, s14 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s25, s22, s6 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_sub_u32 s20, s22, s6 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s25, s22 +; GCN-NEXT: s_cselect_b32 s20, s20, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s24, 0, s7 +; GCN-NEXT: s_subb_u32 s22, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s25, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 -; GCN-NEXT: s_mul_i32 s26, s24, s22 -; GCN-NEXT: s_add_i32 s23, s27, s23 -; GCN-NEXT: s_add_i32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s22 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s24, v0 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 +; GCN-NEXT: s_mul_i32 s26, s22, s24 +; GCN-NEXT: s_add_i32 s25, s27, s25 +; GCN-NEXT: s_add_i32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s24 +; GCN-NEXT: s_mul_i32 s27, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 -; GCN-NEXT: s_mul_i32 s28, s25, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 +; GCN-NEXT: s_mul_i32 s28, s23, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s23, s25, s23 -; GCN-NEXT: s_add_u32 s23, s26, s23 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s25, s26, s25 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s27, s22, s23 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s22, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 -; GCN-NEXT: s_add_i32 s22, s23, s22 -; GCN-NEXT: s_mul_i32 s24, s24, s27 -; GCN-NEXT: s_add_i32 s22, s22, s24 -; GCN-NEXT: s_mul_i32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 -; GCN-NEXT: s_mul_i32 s26, s25, s17 -; GCN-NEXT: s_mul_i32 s29, s27, s22 -; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 +; GCN-NEXT: s_add_u32 s24, s24, s25 +; GCN-NEXT: s_addc_u32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 +; GCN-NEXT: s_add_i32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s22, s22, s24 +; GCN-NEXT: s_add_i32 s25, s25, s22 +; GCN-NEXT: s_mul_i32 s17, s17, s24 +; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 +; GCN-NEXT: s_mul_i32 s27, s23, s17 +; GCN-NEXT: s_mul_i32 s29, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s26 -; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 -; GCN-NEXT: s_addc_u32 s17, s28, s24 -; GCN-NEXT: s_addc_u32 s23, s23, 0 -; GCN-NEXT: s_mul_i32 s22, s25, s22 -; GCN-NEXT: s_add_u32 s17, s17, s22 -; GCN-NEXT: s_addc_u32 s24, 0, s23 -; GCN-NEXT: s_add_u32 s17, s27, s17 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s26, s25, s24 +; GCN-NEXT: s_add_u32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 +; GCN-NEXT: s_addc_u32 s17, s28, s26 +; GCN-NEXT: s_addc_u32 s22, s22, 0 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s17, s17, s25 +; GCN-NEXT: s_addc_u32 s22, 0, s22 +; GCN-NEXT: s_add_u32 s17, s24, s17 +; GCN-NEXT: s_addc_u32 s26, s23, s22 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s34, s30, s6 -; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_sub_u32 s28, s30, s6 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s34, s30 +; GCN-NEXT: s_cselect_b32 s28, s28, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s22, 0, s19 +; GCN-NEXT: s_subb_u32 s20, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s20, v0 -; GCN-NEXT: s_mul_i32 s21, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 -; GCN-NEXT: s_mul_i32 s24, s22, s20 -; GCN-NEXT: s_add_i32 s21, s25, s21 -; GCN-NEXT: s_add_i32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s26, s13, s20 -; GCN-NEXT: s_mul_i32 s25, s20, s21 -; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 +; GCN-NEXT: v_readfirstlane_b32 s21, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 +; GCN-NEXT: s_mul_i32 s24, s20, s22 +; GCN-NEXT: s_add_i32 s23, s25, s23 +; GCN-NEXT: s_add_i32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s22 +; GCN-NEXT: s_mul_i32 s25, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 -; GCN-NEXT: s_mul_i32 s26, s23, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 +; GCN-NEXT: s_mul_i32 s26, s21, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s21, s23, s21 -; GCN-NEXT: s_add_u32 s21, s24, s21 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s23, s24, s23 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s25, s20, s21 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s20, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 -; GCN-NEXT: s_add_i32 s20, s21, s20 -; GCN-NEXT: s_mul_i32 s22, s22, s25 -; GCN-NEXT: s_add_i32 s20, s20, s22 -; GCN-NEXT: s_mul_i32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 -; GCN-NEXT: s_mul_i32 s24, s23, s13 -; GCN-NEXT: s_mul_i32 s27, s25, s20 -; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 +; GCN-NEXT: s_add_u32 s22, s22, s23 +; GCN-NEXT: s_addc_u32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 +; GCN-NEXT: s_add_i32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s20, s20, s22 +; GCN-NEXT: s_add_i32 s23, s23, s20 +; GCN-NEXT: s_mul_i32 s13, s13, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 +; GCN-NEXT: s_mul_i32 s25, s21, s13 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s24 -; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 -; GCN-NEXT: s_addc_u32 s13, s26, s22 -; GCN-NEXT: s_addc_u32 s21, s21, 0 -; GCN-NEXT: s_mul_i32 s20, s23, s20 -; GCN-NEXT: s_add_u32 s13, s13, s20 -; GCN-NEXT: s_addc_u32 s22, 0, s21 -; GCN-NEXT: s_add_u32 s13, s25, s13 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s24, s23, s22 +; GCN-NEXT: s_add_u32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 +; GCN-NEXT: s_addc_u32 s13, s26, s24 +; GCN-NEXT: s_addc_u32 s20, s20, 0 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s13, s13, s23 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_add_u32 s13, s22, s13 +; GCN-NEXT: s_addc_u32 s24, s21, s20 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s31, s28, s18 -; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_sub_u32 s26, s28, s18 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s31, s28 +; GCN-NEXT: s_cselect_b32 s26, s26, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s18, 0, s15 +; GCN-NEXT: s_subb_u32 s16, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s19, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 -; GCN-NEXT: s_mul_i32 s20, s18, s16 -; GCN-NEXT: s_add_i32 s17, s21, s17 -; GCN-NEXT: s_add_i32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s16 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s18, v0 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 +; GCN-NEXT: s_mul_i32 s20, s16, s18 +; GCN-NEXT: s_add_i32 s19, s21, s19 +; GCN-NEXT: s_add_i32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s18 +; GCN-NEXT: s_mul_i32 s21, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 -; GCN-NEXT: s_mul_i32 s22, s19, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 +; GCN-NEXT: s_mul_i32 s22, s17, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s17, s19, s17 -; GCN-NEXT: s_add_u32 s17, s20, s17 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s19, s20, s19 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s21, s16, s17 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s16, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 -; GCN-NEXT: s_add_i32 s16, s17, s16 -; GCN-NEXT: s_mul_i32 s18, s18, s21 -; GCN-NEXT: s_add_i32 s16, s16, s18 -; GCN-NEXT: s_mul_i32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 -; GCN-NEXT: s_mul_i32 s20, s19, s9 -; GCN-NEXT: s_mul_i32 s23, s21, s16 -; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 +; GCN-NEXT: s_add_u32 s18, s18, s19 +; GCN-NEXT: s_addc_u32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 +; GCN-NEXT: s_add_i32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s16, s16, s18 +; GCN-NEXT: s_add_i32 s19, s19, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s17, s9 +; GCN-NEXT: s_mul_i32 s23, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s20 -; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 -; GCN-NEXT: s_addc_u32 s9, s22, s18 -; GCN-NEXT: s_addc_u32 s17, s17, 0 -; GCN-NEXT: s_mul_i32 s16, s19, s16 -; GCN-NEXT: s_add_u32 s9, s9, s16 -; GCN-NEXT: s_addc_u32 s18, 0, s17 -; GCN-NEXT: s_add_u32 s9, s21, s9 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s20, s19, s18 +; GCN-NEXT: s_add_u32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 +; GCN-NEXT: s_addc_u32 s9, s22, s20 +; GCN-NEXT: s_addc_u32 s16, s16, 0 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_add_u32 s9, s18, s9 +; GCN-NEXT: s_addc_u32 s20, s17, s16 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s27, s24, s14 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_sub_u32 s22, s24, s14 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s27, s24 +; GCN-NEXT: s_cselect_b32 s22, s22, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 33b0a5d..ea9bb04 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s12, s11, s12 -; GCN-NEXT: s_sub_i32 s13, s7, s12 +; GCN-NEXT: s_add_i32 s14, s11, s12 +; GCN-NEXT: s_sub_i32 s12, s7, s14 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s14, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s15, s6, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s16, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s4 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s17, s17, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s18, s15, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s13, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s6, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s17, s12, s13 +; GCN-NEXT: s_subb_u32 s17, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s4 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, s19, s18 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s15, s15, s5 +; GCN-NEXT: s_sub_u32 s19, s16, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s11, s18, s15 -; GCN-NEXT: s_cselect_b32 s10, s10, s16 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s12 +; GCN-NEXT: s_subb_u32 s7, s7, s14 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s12 +; GCN-NEXT: s_cselect_b32 s4, s4, s10 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s10, s7 -; GCN-NEXT: s_cselect_b32 s4, s11, s6 +; GCN-NEXT: s_cselect_b32 s5, s12, s7 +; GCN-NEXT: s_cselect_b32 s4, s13, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s8, s8, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s8 -; GCN-NEXT: s_sub_u32 s10, 24, s6 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s11, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s12, s10, s4 +; GCN-NEXT: s_add_i32 s10, s8, s7 +; GCN-NEXT: s_sub_i32 s8, 0, s10 +; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s9, s6, s7 +; GCN-NEXT: s_subb_u32 s12, s8, s5 +; GCN-NEXT: s_sub_u32 s13, s11, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s4 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s13, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s13, s9, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s5 +; GCN-NEXT: s_subb_u32 s6, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s6, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s4 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s5 -; GCN-NEXT: s_cselect_b32 s14, s14, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s15, s12, s4 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s6, s9, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s7, s15, s12 -; GCN-NEXT: s_cselect_b32 s6, s6, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s8, 0, s8 -; GCN-NEXT: s_cmp_ge_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s4 +; GCN-NEXT: s_cmp_ge_u32 s11, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s9 +; GCN-NEXT: s_cmp_eq_u32 s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s6, s8 -; GCN-NEXT: s_cselect_b32 s5, s7, s10 +; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: s_cselect_b32 s5, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s9, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bb5918b2..bdd22f25 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_addc_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s4, s3, s7 +; GFX9-NEXT: s_add_u32 s4, s2, s6 +; GFX9-NEXT: s_addc_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 41199b0..fd461ac 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: s_mul_i32 s0, s3, s10 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 -; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s12, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_add_i32 s11, s1, s0 +; GCN-NEXT: s_sub_i32 s8, 0, s11 +; GCN-NEXT: s_mul_i32 s0, s2, s10 +; GCN-NEXT: s_sub_u32 s12, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s9, s0, s1 +; GCN-NEXT: s_subb_u32 s13, s8, s3 +; GCN-NEXT: s_sub_u32 s14, s12, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_cselect_b32 s13, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s8, s13, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s13, 0, 0 +; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s14, s9 +; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_subb_u32 s0, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s10, s1 -; GCN-NEXT: s_add_u32 s1, s8, 1 -; GCN-NEXT: s_addc_u32 s10, 0, 0 -; GCN-NEXT: s_add_u32 s13, s8, 2 -; GCN-NEXT: s_addc_u32 s14, 0, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s13, s1 -; GCN-NEXT: s_cselect_b32 s1, s14, s10 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_cselect_b32 s0, s9, 0 +; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 -; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 9bcba6c..2d7ce10 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index cdcc914..137dc1f 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_add_i32 s10, s1, s0 +; GCN-NEXT: s_sub_i32 s9, 0, s10 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s8, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s11, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s12, s8, s2 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s8, s0, s1 +; GCN-NEXT: s_subb_u32 s12, s9, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s3 +; GCN-NEXT: s_sub_u32 s16, s13, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s13, s10, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s3 +; GCN-NEXT: s_subb_u32 s0, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s3 -; GCN-NEXT: s_cselect_b32 s14, s14, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s15, s12, s2 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s1, s15, s12 -; GCN-NEXT: s_cselect_b32 s0, s0, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s9 -; GCN-NEXT: s_cselect_b32 s1, s1, s8 +; GCN-NEXT: s_cmp_eq_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s8, s0 +; GCN-NEXT: s_cselect_b32 s1, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 -; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index d67a7b1..e8db647 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_subb_u32 s4, s3, s7 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 75db387..28c6b40 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 -; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s9, s8 ; GFX1032-NEXT: s_mul_i32 s9, s9, s5 -; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1032-NEXT: s_add_i32 s9, s13, s9 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_add_i32 s9, s11, s9 +; GFX1032-NEXT: s_mul_i32 s11, s5, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_i32 s10, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s12, s12, s15 +; GFX1032-NEXT: s_add_u32 s10, s10, s15 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s12, s10 +; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s10, s11 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s11, 0 +; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 -; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1032-NEXT: s_add_u32 s11, s11, s12 -; GFX1032-NEXT: s_addc_u32 s10, 0, s10 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_add_u32 s9, s9, s12 +; GFX1032-NEXT: s_addc_u32 s11, 0, s11 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s11, s8 +; GFX1032-NEXT: s_add_u32 s8, s9, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s10, s9 +; GFX1032-NEXT: s_addc_u32 s8, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 -; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s9, 0, s0 -; GFX1064-NEXT: s_subb_u32 s10, 0, s1 +; GFX1064-NEXT: s_sub_u32 s8, 0, s0 +; GFX1064-NEXT: s_subb_u32 s9, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: s_mul_i32 s5, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 -; GFX1064-NEXT: s_mul_i32 s11, s10, s4 -; GFX1064-NEXT: s_add_i32 s5, s12, s5 -; GFX1064-NEXT: s_mul_i32 s13, s9, s4 -; GFX1064-NEXT: s_add_i32 s5, s5, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX1064-NEXT: s_mul_i32 s15, s4, s5 -; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1064-NEXT: s_mul_i32 s11, s8, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1064-NEXT: s_mul_i32 s10, s8, s4 +; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s9, s5 +; GFX1064-NEXT: s_add_i32 s10, s12, s10 +; GFX1064-NEXT: s_mul_i32 s13, s8, s5 +; GFX1064-NEXT: s_add_i32 s10, s10, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 +; GFX1064-NEXT: s_mul_i32 s15, s5, s10 +; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 +; GFX1064-NEXT: s_mul_i32 s11, s4, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 +; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s10, s4, s10 ; GFX1064-NEXT: s_addc_u32 s11, s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s5, s11, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s12, s4, s5 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_i32 s4, s9, s12 -; GFX1064-NEXT: s_addc_u32 s8, s8, s11 -; GFX1064-NEXT: s_mul_i32 s10, s10, s12 -; GFX1064-NEXT: s_mul_i32 s9, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX1064-NEXT: s_add_i32 s9, s13, s9 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX1064-NEXT: s_add_i32 s9, s9, s10 -; GFX1064-NEXT: s_mul_i32 s4, s8, s4 -; GFX1064-NEXT: s_mul_i32 s14, s12, s9 -; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1064-NEXT: s_add_u32 s5, s5, s14 +; GFX1064-NEXT: s_add_u32 s5, s5, s10 +; GFX1064-NEXT: s_addc_u32 s4, s4, s11 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s8, s5 +; GFX1064-NEXT: s_mul_i32 s8, s8, s4 +; GFX1064-NEXT: s_mul_i32 s9, s9, s5 +; GFX1064-NEXT: s_add_i32 s8, s10, s8 +; GFX1064-NEXT: s_mul_i32 s10, s4, s11 +; GFX1064-NEXT: s_add_i32 s8, s8, s9 +; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 +; GFX1064-NEXT: s_mul_i32 s14, s5, s8 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s14 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1064-NEXT: s_add_u32 s4, s5, s4 -; GFX1064-NEXT: s_mul_i32 s9, s8, s9 -; GFX1064-NEXT: s_addc_u32 s4, s13, s11 -; GFX1064-NEXT: s_addc_u32 s5, s10, 0 -; GFX1064-NEXT: s_add_u32 s4, s4, s9 -; GFX1064-NEXT: s_addc_u32 s9, 0, s5 -; GFX1064-NEXT: s_add_u32 s10, s12, s4 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 -; GFX1064-NEXT: s_addc_u32 s5, s8, s9 -; GFX1064-NEXT: s_mul_i32 s8, s3, s10 -; GFX1064-NEXT: s_mul_i32 s10, s2, s5 -; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 -; GFX1064-NEXT: s_addc_u32 s9, 0, s9 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s10, s8 +; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s8, s4, s8 +; GFX1064-NEXT: s_addc_u32 s9, s13, s12 +; GFX1064-NEXT: s_addc_u32 s10, s11, 0 +; GFX1064-NEXT: s_add_u32 s8, s9, s8 +; GFX1064-NEXT: s_addc_u32 s9, 0, s10 +; GFX1064-NEXT: s_add_u32 s5, s5, s8 +; GFX1064-NEXT: s_addc_u32 s4, s4, s9 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 +; GFX1064-NEXT: s_mul_i32 s11, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_addc_u32 s4, s9, s4 +; GFX1064-NEXT: s_add_u32 s8, s8, s11 +; GFX1064-NEXT: s_addc_u32 s10, 0, s10 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 +; GFX1064-NEXT: s_add_u32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s4, s3, s4 +; GFX1064-NEXT: s_addc_u32 s5, s10, s9 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s4, s5 +; GFX1064-NEXT: s_add_u32 s10, s5, s4 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s12, s4, s8 +; GFX1064-NEXT: s_add_i32 s8, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s8, s3, s12 -; GFX1064-NEXT: s_sub_u32 s13, s2, s4 +; GFX1064-NEXT: s_sub_i32 s9, s3, s8 +; GFX1064-NEXT: s_sub_u32 s12, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s14, s8, s1 -; GFX1064-NEXT: s_sub_u32 s15, s13, s0 -; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1064-NEXT: s_subb_u32 s8, s14, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, s1 +; GFX1064-NEXT: s_sub_u32 s13, s12, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 ; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1064-NEXT: s_add_u32 s9, s10, 1 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 +; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1064-NEXT: s_add_u32 s13, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s12 +; GFX1064-NEXT: s_subb_u32 s3, s3, s8 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 64d055b..4445383 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; GISEL-GFX12-NEXT: s_wait_alu 0xfffe -; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll index 972a470..cabd43e 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .llvm.callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll index ec8d5b8..3d3974e 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .llvm.callgraph,"o",%progbits,.text +; CHECK: .section .llvm.callgraph,"o",%llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll new file mode 100644 index 0000000..df0cbeb --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; This IR is hand-written. + +; ModuleID = 'ptr-named-2.ll' +source_filename = "ptr-named-2.ll" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel-unknown-none" + +%struct.TypeExamples = type { i32*, i32, i32, i32* } + +@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0 + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!21} + +; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64 +; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128 +; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3 +; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3 +; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7 +; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static +; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=8 offset=0 size=24 + +!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None) +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true) +!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp") +!7 = !{} +!8 = !{!0} +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10) +!10 = !{!11, !12, !13, !14} +!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64) +!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16) +!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16) +!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64) +!21 = !{!"my hand-written IR"} diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll new file mode 100644 index 0000000..675c34e --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; Source: +; #![no_std] +; #![no_main] +; +; pub struct MyType { +; ptr: *const u32, +; } +; +; impl MyType { +; pub const fn new() -> Self { +; let ptr = core::ptr::null(); +; Self { ptr } +; } +; } +; +; unsafe impl Sync for MyType {} +; +; #[unsafe(no_mangle)] +; pub static X: MyType = MyType::new(); +; +; #[cfg(not(test))] +; #[panic_handler] +; fn panic(_info: &core::panic::PanicInfo) -> ! { +; loop {} +; } +; Compilation flag: +; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc +; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc +; llvm-dis ptr-named.bc -o ptr-named.ll + +; ModuleID = 'ptr-named.bc' +source_filename = "1m2uqe50qkwxmo53ydydvou91" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel" + +@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0 + +!llvm.module.flags = !{!11, !12, !13, !14} +!llvm.ident = !{!15} +!llvm.dbg.cu = !{!16} + +; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none) +; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global +; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=4 offset=0 size=8 + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64) +!2 = !DINamespace(name: "ptr_named", scope: null) +!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932") +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3") +!5 = !DIFile(filename: "<unknown>", directory: "") +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate) +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0) +!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned) +!10 = !{} +!11 = !{i32 8, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{i32 7, !"Dwarf Version", i32 4} +!14 = !{i32 2, !"Debug Info Version", i32 3} +!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"} +!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None) +!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named") +!18 = !{!0} diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll index 4f13f47..56798c8 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll @@ -28,6 +28,11 @@ define void @test() { @llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str) ; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S } + ; StructuredBuffer<float[3][2]> + %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null) + ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] } + ; ByteAddressBuffer %byteaddr = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null) @@ -40,12 +45,14 @@ define void @test() { ; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>" ; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>" ; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>" +; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>" ; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer ; CHECK: !{i32 0, ptr @[[T0]], !"A" ; CHECK: !{i32 1, ptr @[[T1]], !"" ; CHECK: !{i32 2, ptr @[[T2]], !"" ; CHECK: !{i32 3, ptr @[[S0]], !"SB" -; CHECK: !{i32 4, ptr @[[B0]], !"" +; CHECK: !{i32 4, ptr @[[S1]], !"" +; CHECK: !{i32 5, ptr @[[B0]], !"" attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/Hexagon/swp-phi.ll b/llvm/test/CodeGen/Hexagon/swp-phi.ll index 9b0e126..6ce2481 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -enable-unsafe-fp-math -enable-pipeliner \ +; RUN: llc -mtriple=hexagon -enable-pipeliner \ ; RUN: -pipeliner-prune-deps=false -stats -o /dev/null < %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json index 2894fff..da0d13d 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json @@ -1,5 +1,5 @@ { - "entities" : { + "Opcodes" : { "ABS_Fp":[1, 2], "ADC":[3, 4], "ADD":[5, 6], @@ -7,5 +7,21 @@ "ADDPDrr":[9, 10], "ADDPSrr":[11, 12], "ADDSDrm":[13, 14] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1], + "MBB": [0.2, 0.2], + "FrameIndex": [0.3, 0.3], + "GlobalAddress": [0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5], + "GR64": [0.6, 0.6], + "XMM": [0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8], + "GR64": [0.9, 0.9], + "XMM": [1.0, 1.0] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json index 5de715b..f4b14a4 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "KILL": [0.1, 0.2, 0.3], "MOV": [0.4, 0.5, 0.6], "LEA": [0.7, 0.8, 0.9], @@ -18,5 +18,21 @@ "POP": [4.6, 4.7, 4.8], "NOP": [4.9, 5.0, 5.1], "COPY": [5.2, 5.3, 5.4] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1, 0.1], + "MBB": [0.2, 0.2, 0.2], + "FrameIndex": [0.3, 0.3, 0.3], + "GlobalAddress": [0.4, 0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5, 0.5], + "GR64": [0.6, 0.6, 0.6], + "XMM": [0.7, 0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8, 0.8], + "GR64": [0.9, 0.9, 0.9], + "XMM": [1.0, 1.0, 1.0] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json index bf04163..6274fb7 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json @@ -1,7 +1,16 @@ { - "entities": { + "Opcodes": { "ADD": [1.0, 2.0, 3.0], "SUB": [1.5], "MUL": [2.0, 3.0] + }, + "CommonOperands": { + "Immediate": [1.0] + }, + "PhysicalRegisters": { + "GR32": [1.0, 2.0] + }, + "VirtualRegisters": { + "GR32": [1.0, 2.0, 3.0] } } diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json index 63e8ccbd..7bfdf3b 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "ADD": [], "SUB": [], "MUL": [], @@ -8,5 +8,14 @@ "JMP": [], "CALL": [], "RET": [] + }, + "CommonOperands": { + "Immediate": [] + }, + "PhysicalRegisters": { + "GR32": [] + }, + "VirtualRegisters": { + "GR32": [] } }
\ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index 6327cff..d3c0da9 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ] Key: XSTORE: [ 0.00 0.00 ] Key: XSUSLDTRK: [ 0.00 0.00 ] Key: XTEST: [ 0.00 0.00 ] +Key: Immediate: [ 0.10 0.10 ] +Key: CImmediate: [ 0.00 0.00 ] +Key: FPImmediate: [ 0.00 0.00 ] +Key: MBB: [ 0.20 0.20 ] +Key: FrameIndex: [ 0.30 0.30 ] +Key: ConstantPoolIndex: [ 0.00 0.00 ] +Key: TargetIndex: [ 0.00 0.00 ] +Key: JumpTableIndex: [ 0.00 0.00 ] +Key: ExternalSymbol: [ 0.00 0.00 ] +Key: GlobalAddress: [ 0.40 0.40 ] +Key: BlockAddress: [ 0.00 0.00 ] +Key: RegisterMask: [ 0.00 0.00 ] +Key: RegisterLiveOut: [ 0.00 0.00 ] +Key: Metadata: [ 0.00 0.00 ] +Key: MCSymbol: [ 0.00 0.00 ] +Key: CFIIndex: [ 0.00 0.00 ] +Key: IntrinsicID: [ 0.00 0.00 ] +Key: Predicate: [ 0.00 0.00 ] +Key: ShuffleMask: [ 0.00 0.00 ] +Key: PhyReg_GR8: [ 0.00 0.00 ] +Key: PhyReg_GRH8: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: PhyReg_GRH16: [ 0.00 0.00 ] +Key: PhyReg_GR16: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK1: [ 0.00 0.00 ] +Key: PhyReg_VK16: [ 0.00 0.00 ] +Key: PhyReg_VK2: [ 0.00 0.00 ] +Key: PhyReg_VK4: [ 0.00 0.00 ] +Key: PhyReg_VK8: [ 0.00 0.00 ] +Key: PhyReg_VK16WM: [ 0.00 0.00 ] +Key: PhyReg_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_VK2WM: [ 0.00 0.00 ] +Key: PhyReg_VK4WM: [ 0.00 0.00 ] +Key: PhyReg_VK8WM: [ 0.00 0.00 ] +Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ] +Key: PhyReg_FPCCR: [ 0.00 0.00 ] +Key: PhyReg_FR16X: [ 0.00 0.00 ] +Key: PhyReg_FR16: [ 0.00 0.00 ] +Key: PhyReg_VK16PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK2PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK4PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK8PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_FR32X: [ 0.00 0.00 ] +Key: PhyReg_GR32: [ 0.50 0.50 ] +Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ] +Key: PhyReg_FR32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_RFP32: [ 0.00 0.00 ] +Key: PhyReg_VK32WM: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_DC: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_CCR: [ 0.00 0.00 ] +Key: PhyReg_DFCCR: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_RFP64: [ 0.00 0.00 ] +Key: PhyReg_GR64: [ 0.60 0.60 ] +Key: PhyReg_FR64X: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ] +Key: PhyReg_FR64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK64: [ 0.00 0.00 ] +Key: PhyReg_VR64: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_VK64WM: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_AD: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_A: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_RST: [ 0.00 0.00 ] +Key: PhyReg_RFP80: [ 0.00 0.00 ] +Key: PhyReg_RFP80_7: [ 0.00 0.00 ] +Key: PhyReg_VR128X: [ 0.00 0.00 ] +Key: PhyReg_VR128: [ 0.00 0.00 ] +Key: PhyReg_VR256X: [ 0.00 0.00 ] +Key: PhyReg_VR256: [ 0.00 0.00 ] +Key: PhyReg_VR512: [ 0.00 0.00 ] +Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] +Key: PhyReg_TILE: [ 0.00 0.00 ] +Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] +Key: VirtReg_GR8: [ 0.00 0.00 ] +Key: VirtReg_GRH8: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: VirtReg_GRH16: [ 0.00 0.00 ] +Key: VirtReg_GR16: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK1: [ 0.00 0.00 ] +Key: VirtReg_VK16: [ 0.00 0.00 ] +Key: VirtReg_VK2: [ 0.00 0.00 ] +Key: VirtReg_VK4: [ 0.00 0.00 ] +Key: VirtReg_VK8: [ 0.00 0.00 ] +Key: VirtReg_VK16WM: [ 0.00 0.00 ] +Key: VirtReg_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_VK2WM: [ 0.00 0.00 ] +Key: VirtReg_VK4WM: [ 0.00 0.00 ] +Key: VirtReg_VK8WM: [ 0.00 0.00 ] +Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ] +Key: VirtReg_FPCCR: [ 0.00 0.00 ] +Key: VirtReg_FR16X: [ 0.00 0.00 ] +Key: VirtReg_FR16: [ 0.00 0.00 ] +Key: VirtReg_VK16PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK2PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK4PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK8PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_FR32X: [ 0.00 0.00 ] +Key: VirtReg_GR32: [ 0.80 0.80 ] +Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ] +Key: VirtReg_FR32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_RFP32: [ 0.00 0.00 ] +Key: VirtReg_VK32WM: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_DC: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_CCR: [ 0.00 0.00 ] +Key: VirtReg_DFCCR: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_RFP64: [ 0.00 0.00 ] +Key: VirtReg_GR64: [ 0.90 0.90 ] +Key: VirtReg_FR64X: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ] +Key: VirtReg_FR64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK64: [ 0.00 0.00 ] +Key: VirtReg_VR64: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_VK64WM: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_AD: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_A: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_RST: [ 0.00 0.00 ] +Key: VirtReg_RFP80: [ 0.00 0.00 ] +Key: VirtReg_RFP80_7: [ 0.00 0.00 ] +Key: VirtReg_VR128X: [ 0.00 0.00 ] +Key: VirtReg_VR128: [ 0.00 0.00 ] +Key: VirtReg_VR256X: [ 0.00 0.00 ] +Key: VirtReg_VR256: [ 0.00 0.00 ] +Key: VirtReg_VR512: [ 0.00 0.00 ] +Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] +Key: VirtReg_TILE: [ 0.00 0.00 ] +Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index 4409e6d..c6e5508 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ] Key: XSTORE: [ 0.00 0.00 ] Key: XSUSLDTRK: [ 0.00 0.00 ] Key: XTEST: [ 0.00 0.00 ] +Key: Immediate: [ 0.10 0.10 ] +Key: CImmediate: [ 0.00 0.00 ] +Key: FPImmediate: [ 0.00 0.00 ] +Key: MBB: [ 0.20 0.20 ] +Key: FrameIndex: [ 0.30 0.30 ] +Key: ConstantPoolIndex: [ 0.00 0.00 ] +Key: TargetIndex: [ 0.00 0.00 ] +Key: JumpTableIndex: [ 0.00 0.00 ] +Key: ExternalSymbol: [ 0.00 0.00 ] +Key: GlobalAddress: [ 0.40 0.40 ] +Key: BlockAddress: [ 0.00 0.00 ] +Key: RegisterMask: [ 0.00 0.00 ] +Key: RegisterLiveOut: [ 0.00 0.00 ] +Key: Metadata: [ 0.00 0.00 ] +Key: MCSymbol: [ 0.00 0.00 ] +Key: CFIIndex: [ 0.00 0.00 ] +Key: IntrinsicID: [ 0.00 0.00 ] +Key: Predicate: [ 0.00 0.00 ] +Key: ShuffleMask: [ 0.00 0.00 ] +Key: PhyReg_GR8: [ 0.00 0.00 ] +Key: PhyReg_GRH8: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: PhyReg_GRH16: [ 0.00 0.00 ] +Key: PhyReg_GR16: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK1: [ 0.00 0.00 ] +Key: PhyReg_VK16: [ 0.00 0.00 ] +Key: PhyReg_VK2: [ 0.00 0.00 ] +Key: PhyReg_VK4: [ 0.00 0.00 ] +Key: PhyReg_VK8: [ 0.00 0.00 ] +Key: PhyReg_VK16WM: [ 0.00 0.00 ] +Key: PhyReg_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_VK2WM: [ 0.00 0.00 ] +Key: PhyReg_VK4WM: [ 0.00 0.00 ] +Key: PhyReg_VK8WM: [ 0.00 0.00 ] +Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ] +Key: PhyReg_FPCCR: [ 0.00 0.00 ] +Key: PhyReg_FR16X: [ 0.00 0.00 ] +Key: PhyReg_FR16: [ 0.00 0.00 ] +Key: PhyReg_VK16PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK2PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK4PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK8PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_FR32X: [ 0.00 0.00 ] +Key: PhyReg_GR32: [ 0.50 0.50 ] +Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ] +Key: PhyReg_FR32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_RFP32: [ 0.00 0.00 ] +Key: PhyReg_VK32WM: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_DC: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_CCR: [ 0.00 0.00 ] +Key: PhyReg_DFCCR: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_RFP64: [ 0.00 0.00 ] +Key: PhyReg_GR64: [ 0.60 0.60 ] +Key: PhyReg_FR64X: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ] +Key: PhyReg_FR64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK64: [ 0.00 0.00 ] +Key: PhyReg_VR64: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_VK64WM: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_AD: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_A: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_RST: [ 0.00 0.00 ] +Key: PhyReg_RFP80: [ 0.00 0.00 ] +Key: PhyReg_RFP80_7: [ 0.00 0.00 ] +Key: PhyReg_VR128X: [ 0.00 0.00 ] +Key: PhyReg_VR128: [ 0.00 0.00 ] +Key: PhyReg_VR256X: [ 0.00 0.00 ] +Key: PhyReg_VR256: [ 0.00 0.00 ] +Key: PhyReg_VR512: [ 0.00 0.00 ] +Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] +Key: PhyReg_TILE: [ 0.00 0.00 ] +Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] +Key: VirtReg_GR8: [ 0.00 0.00 ] +Key: VirtReg_GRH8: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: VirtReg_GRH16: [ 0.00 0.00 ] +Key: VirtReg_GR16: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK1: [ 0.00 0.00 ] +Key: VirtReg_VK16: [ 0.00 0.00 ] +Key: VirtReg_VK2: [ 0.00 0.00 ] +Key: VirtReg_VK4: [ 0.00 0.00 ] +Key: VirtReg_VK8: [ 0.00 0.00 ] +Key: VirtReg_VK16WM: [ 0.00 0.00 ] +Key: VirtReg_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_VK2WM: [ 0.00 0.00 ] +Key: VirtReg_VK4WM: [ 0.00 0.00 ] +Key: VirtReg_VK8WM: [ 0.00 0.00 ] +Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ] +Key: VirtReg_FPCCR: [ 0.00 0.00 ] +Key: VirtReg_FR16X: [ 0.00 0.00 ] +Key: VirtReg_FR16: [ 0.00 0.00 ] +Key: VirtReg_VK16PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK2PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK4PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK8PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_FR32X: [ 0.00 0.00 ] +Key: VirtReg_GR32: [ 0.80 0.80 ] +Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ] +Key: VirtReg_FR32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_RFP32: [ 0.00 0.00 ] +Key: VirtReg_VK32WM: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_DC: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_CCR: [ 0.00 0.00 ] +Key: VirtReg_DFCCR: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_RFP64: [ 0.00 0.00 ] +Key: VirtReg_GR64: [ 0.90 0.90 ] +Key: VirtReg_FR64X: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ] +Key: VirtReg_FR64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK64: [ 0.00 0.00 ] +Key: VirtReg_VR64: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_VK64WM: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_AD: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_A: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_RST: [ 0.00 0.00 ] +Key: VirtReg_RFP80: [ 0.00 0.00 ] +Key: VirtReg_RFP80_7: [ 0.00 0.00 ] +Key: VirtReg_VR128X: [ 0.00 0.00 ] +Key: VirtReg_VR128: [ 0.00 0.00 ] +Key: VirtReg_VR256X: [ 0.00 0.00 ] +Key: VirtReg_VR256: [ 0.00 0.00 ] +Key: VirtReg_VR512: [ 0.00 0.00 ] +Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] +Key: VirtReg_TILE: [ 0.00 0.00 ] +Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir index 5734a23..f2572f5 100644 --- a/llvm/test/CodeGen/MIR2Vec/if-else.mir +++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir @@ -135,10 +135,10 @@ body: | # CHECK: Machine basic block vectors: # CHECK-NEXT: Machine basic block: abc:entry: -# CHECK-NEXT: [ 16.50 17.10 17.70 ] +# CHECK-NEXT: [ 23.60 24.20 24.80 ] # CHECK-NEXT: Machine basic block: abc:if.then: -# CHECK-NEXT: [ 4.50 4.80 5.10 ] +# CHECK-NEXT: [ 7.30 7.60 7.90 ] # CHECK-NEXT: Machine basic block: abc:if.else: -# CHECK-NEXT: [ 0.80 1.00 1.20 ] +# CHECK-NEXT: [ 3.40 3.60 3.80 ] # CHECK-NEXT: Machine basic block: abc:return: -# CHECK-NEXT: [ 6.60 6.90 7.20 ]
\ No newline at end of file +# CHECK-NEXT: [ 8.80 9.10 9.40 ] diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir index 338cb63..0fdcc81 100644 --- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir +++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir @@ -48,29 +48,29 @@ body: | RET 0 # CHECK: MIR2Vec embeddings for machine function add_function: -# CHECK: Function vector: [ 19.20 19.80 20.40 ] +# CHECK: Function vector: [ 26.50 27.10 27.70 ] # CHECK-NEXT: Machine basic block vectors: # CHECK-NEXT: Machine basic block: add_function:entry: -# CHECK-NEXT: [ 19.20 19.80 20.40 ] +# CHECK-NEXT: [ 26.50 27.10 27.70 ] # CHECK-NEXT: Machine instruction vectors: # CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags -# CHECK-NEXT: [ 1.30 1.40 1.50 ] +# CHECK-NEXT: [ 3.70 3.80 3.90 ] # CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags -# CHECK-NEXT: [ 1.30 1.40 1.50 ] +# CHECK-NEXT: [ 3.70 3.80 3.90 ] # CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32 -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: RET 0, $eax -# CHECK-NEXT: [ 1.00 1.10 1.20 ] +# CHECK-NEXT: [ 1.10 1.20 1.30 ] # CHECK: MIR2Vec embeddings for machine function simple_function: -# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ] +# CHECK-NEXT:Function vector: [ 1.10 1.20 1.30 ] # CHECK-NEXT: Machine basic block vectors: # CHECK-NEXT: Machine basic block: simple_function:entry: -# CHECK-NEXT: [ 1.00 1.10 1.20 ] +# CHECK-NEXT: [ 1.10 1.20 1.30 ] # CHECK-NEXT: Machine instruction vectors: # CHECK-NEXT: Machine instruction: RET 0 -# CHECK-NEXT: [ 1.00 1.10 1.20 ]
\ No newline at end of file +# CHECK-NEXT: [ 1.10 1.20 1.30 ] diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll index c6554bc..13e908e 100644 --- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll +++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll @@ -10,6 +10,6 @@ define dso_local void @test() { } ; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path -; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero -; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file -; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension +; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero +; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file +; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll index 6693c90..db0eae7 100644 --- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll +++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %} define ptx_device float @t1_f32(float %x, float %y, float %z, ; CHECK-UNSAFE-LABEL: t1_f32( diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll index cad684e..baa127e 100644 --- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll +++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll @@ -2,8 +2,8 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 | FileCheck %s --check-prefix=FMFDEBUG ; RUN: llc < %s -mtriple=powerpc64le | FileCheck %s --check-prefix=FMF -; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG -; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL +; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG +; RUN: llc < %s -mtriple=powerpc64le -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL ; Test FP transforms using instruction/node-level fast-math-flags. ; We're also checking debug output to verify that FMF is propagated to the newly created nodes. diff --git a/llvm/test/CodeGen/PowerPC/scalar-equal.ll b/llvm/test/CodeGen/PowerPC/scalar-equal.ll index 1832475..c0b11b4 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-equal.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-equal.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: --check-prefix=FAST-P8 -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ diff --git a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll index ca9baceb..5915bd3 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll index fd0b494..881d1f4 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: --check-prefix=FAST-P8 -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 693a40d..5e5f2b7 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -217,6 +217,11 @@ ; CHECK-NEXT: xsfmm64t - 'XSfmm64t' (TE=64 configuration). ; CHECK-NEXT: xsfmmbase - 'XSfmmbase' (All non arithmetic instructions for all TEWs and sf.vtzero). ; CHECK-NEXT: xsfvcp - 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions). +; CHECK-NEXT: xsfvfbfexp16e - 'XSfvfbfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, BFloat16). +; CHECK-NEXT: xsfvfexp16e - 'XSfvfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, Half Precision). +; CHECK-NEXT: xsfvfexp32e - 'XSfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction, Single Precision). +; CHECK-NEXT: xsfvfexpa - 'XSfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction). +; CHECK-NEXT: xsfvfexpa64e - 'XSfvfexpa64e' (SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision). ; CHECK-NEXT: xsfvfnrclipxfqf - 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions). ; CHECK-NEXT: xsfvfwmaccqqq - 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction (4-by-4)). ; CHECK-NEXT: xsfvqmaccdod - 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)). diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll index 4d32e66..6d41875 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s -; Test that uses of cbuffer members inside ConstantExprs are handled correctly. +; Test that uses of cbuffer members are handled correctly. ; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0 ; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0 @@ -37,10 +37,8 @@ entry: ; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]] ; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]] ; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]] - %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1 - ; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]] - %load_from_gep = load float, ptr addrspace(12) %gep, align 4 + %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4 ; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]] %load_v = load <4 x float>, ptr addrspace(12) @v, align 16 diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll index 4a38d7a..c87f113 100644 --- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll @@ -1,7 +1,7 @@ ; Test that combined sin/cos library call is emitted when appropriate ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT -; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT define float @f1(float %x) { ; CHECK-OPT-LABEL: f1: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir new file mode 100644 index 0000000..5783133 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s + +# From bug #162644. The _wrong_ output of this test is to generate the +# body of the tail-predicated loop like this: +# +# $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 +# renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4) +# $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0 +# renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0 +# $lr = MVE_LETP killed renamable $lr, %bb.1 +# +# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of +# the input MQPRCopy, because it won't copy the vector lanes disabled by +# FPSCR.LTPSIZE, and those are needed in the output value of the loop. +# +# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying +# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE. + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-none-eabihf" + + @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16 + + define <4 x float> @test_func(ptr %0, i32 %1) { + %3 = load <4 x float>, ptr @inactive, align 16 + %4 = add i32 %1, 3 + %5 = call i32 @llvm.smin.i32(i32 %1, i32 4) + %6 = sub i32 %4, %5 + %7 = lshr i32 %6, 2 + %8 = add nuw nsw i32 %7, 1 + %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8) + br label %10 + + 10: ; preds = %10, %2 + %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ] + %12 = phi i32 [ %1, %2 ], [ %19, %10 ] + %13 = phi ptr [ %0, %2 ], [ %18, %10 ] + %14 = phi i32 [ %9, %2 ], [ %20, %10 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) + %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer) + %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3) + %18 = getelementptr inbounds nuw i8, ptr %13, i32 16 + %19 = add i32 %12, -4 + %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %10, label %22 + + 22: ; preds = %10 + ret <4 x float> %17 + } +... +--- +name: test_func +alignment: 4 +legalized: false +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + ; CHECK-LABEL: name: test_func + ; CHECK: bb.0 (%ir-block.2): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $lr, $r0, $r1, $r7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive) + ; CHECK-NEXT: $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + ; CHECK-NEXT: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.10, align 4): + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: liveins: $lr, $d2, $d3, $q0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 + ; CHECK-NEXT: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4) + ; CHECK-NEXT: $d0 = VMOVD $d2, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $d1 = VMOVD $d3, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0 + ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.22): + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 + bb.0 (%ir-block.2): + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r7, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive) + $r2 = tMOVr $r1, 14 /* CC::al */, $noreg + t2IT 10, 8, implicit-def $itstate + renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate + renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg + renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg + $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + renamable $lr = t2DoLoopStartTP killed renamable $r2, renamable $r1 + + bb.1 (%ir-block.10, align 4): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r1 + + renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg + $q2 = MQPRCopy killed $q0 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4) + $q0 = MQPRCopy $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0 + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2 (%ir-block.22): + liveins: $q0 + + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 +... diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll index 053d6a1..d741411 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll @@ -94,5 +94,5 @@ attributes #1 = { minsize nofree norecurse nounwind optsize } !llvm.module.flags = !{!0, !1, !2} !0 = !{i32 8, !"branch-target-enforcement", i32 0} -!1 = !{i32 8, !"sign-return-address", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 2} !2 = !{i32 8, !"sign-return-address-all", i32 0} diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 94efe0f..5eb49fd 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.FiveShorts = type { i16, i16, i16, i16, i16 } @@ -12,6 +13,8 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.ThreeBytes = type { i8, i8, i8 } %struct.FourBytes = type { i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op: ; CHECK: loop @@ -1536,3 +1539,1605 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op: +; CHECK-NOT: f32x4.mul +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op: +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op: +; CHECK: loop +; CHECK-NOT: v128.load +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr %w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.div +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.div +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll new file mode 100644 index 0000000..45f4ddd --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 + +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fmaxnum and fmaximumnum get transformed to relaxed_max + +target triple = "wasm32" + +define <4 x float> @test_maxnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maxnum_f32x4: +; CHECK: .functype test_maxnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_maximumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maximumnum_f32x4: +; CHECK: .functype test_maximumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> @test_maxnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_maxnum_f64x2: +; CHECK: .functype test_maxnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maximumnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll new file mode 100644 index 0000000..f3eec02 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fminnum and fminimumnum get transformed to relaxed_min + +target triple = "wasm32" + +define <4 x float> @test_minnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minnum_f32x4: +; CHECK: .functype test_minnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_minimumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minimumnum_f32x4: +; CHECK: .functype test_minimumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> @test_minnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minnum_f64x2: +; CHECK: .functype test_minnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.fminimumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.fminimumnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll index 123438d..f58456b 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll @@ -94,6 +94,19 @@ entry: ret <16 x i8> %0 } +define <8 x i8> @trunc8i16_8i8(<8 x i16> %a) { +; CHECK-LABEL: trunc8i16_8i8: +; CHECK: .functype trunc8i16_8i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <8 x i16> %a to <8 x i8> + ret <8 x i8> %0 +} + define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; CHECK-LABEL: trunc8i64_8i16: ; CHECK: .functype trunc8i64_8i16 (v128, v128, v128, v128) -> (v128) @@ -139,3 +152,29 @@ entry: %0 = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %0 } + +define <4 x i16> @trunc4i32_4i16(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i16: +; CHECK: .functype trunc4i32_4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i16> + ret <4 x i16> %0 +} + +define <4 x i8> @trunc4i32_4i8(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i8: +; CHECK: .functype trunc4i32_4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i8> + ret <4 x i8> %0 +} diff --git a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll index bea11e9..940fe8c 100644 --- a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll +++ b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=i686-- -mattr=-sse | FileCheck %s -check-prefix=WITHNANS -; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS +; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS ; WITHNANS-LABEL: test: ; WITHNANS: setnp diff --git a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll index 8411a40..ff7a99a 100644 --- a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll +++ b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math -mtriple=i686-- | FileCheck %s +; RUN: llc < %s -mtriple=i686-- | FileCheck %s ; rdar://5902801 declare void @test2() diff --git a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll index 6ebbb2e..0e0e20f 100644 --- a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll +++ b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -enable-unsafe-fp-math +; RUN: llc < %s ; <rdar://problem/12180135> target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" target triple = "i386-apple-macosx10.8.0" diff --git a/llvm/test/CodeGen/X86/avx-minmax.ll b/llvm/test/CodeGen/X86/avx-minmax.ll index 6da04c5..8e4b6c6 100644 --- a/llvm/test/CodeGen/X86/avx-minmax.ll +++ b/llvm/test/CodeGen/X86/avx-minmax.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-no-nans-fp-math | FileCheck %s define <2 x double> @maxpd(<2 x double> %x, <2 x double> %y) { ; CHECK-LABEL: maxpd: diff --git a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll index f827998..eb9de8a 100644 --- a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -mattr=+avx512f | FileCheck %s ; RUN: llc < %s -mtriple=x86_64 -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s ; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s define <16 x float> @test_max_v16f32(ptr %a_ptr, <16 x float> %b) { diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll index 5d9784a..1147d79 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll index b58bae9..1c4d9c6 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index 92bdebb..a8ff969 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/bf16-fast-isel.ll b/llvm/test/CodeGen/X86/bf16-fast-isel.ll new file mode 100644 index 0000000..c659e0e --- /dev/null +++ b/llvm/test/CodeGen/X86/bf16-fast-isel.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define i8 @test_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_fast_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo_fast@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo_fast(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_fast_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +declare bfloat @foo(ptr %f) +declare zeroext i8 @bar(bfloat) +declare fastcc bfloat @foo_fast(ptr %f) diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index 632d90d..f36baba 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -27,7 +27,7 @@ entry: !1 = !{i64 0, !"_ZTSFivE.generalized"} !2 = !{i64 0, !"_ZTSFviE.generalized"} -; CHECK: .section .llvm.callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index ed6849a..cdbad66 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -36,7 +36,7 @@ entry: !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -; CHECK: .section .llvm.callgraph,"o",@progbits,.text +; CHECK: .section .llvm.callgraph,"o",@llvm_call_graph,.text ;; Version ; CHECK-NEXT: .byte 0 ;; Flags diff --git a/llvm/test/CodeGen/X86/dag-fmf-cse.ll b/llvm/test/CodeGen/X86/dag-fmf-cse.ll index 609ccdc..cdcc082 100644 --- a/llvm/test/CodeGen/X86/dag-fmf-cse.ll +++ b/llvm/test/CodeGen/X86/dag-fmf-cse.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma | FileCheck %s ; If fast-math-flags are propagated correctly, the mul1 expression ; should be recognized as a factor in the last fsub, so we should diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll index 82c82ac..4e6da83 100644 --- a/llvm/test/CodeGen/X86/fabs.ll +++ b/llvm/test/CodeGen/X86/fabs.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=X87 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 declare float @fabsf(float) diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 0fe107c..aae6cda 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -22,25 +22,24 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>) define float @test_fmaximumnum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fmaximumnum: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: js .LBB0_2 -; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: js .LBB0_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: jmp .LBB0_3 +; SSE2-NEXT: .LBB0_1: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: .LBB0_3: ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB0_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: .LBB0_4: -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm0 +; SSE2-NEXT: cmpunordss %xmm3, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum: @@ -56,7 +55,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; AVX1-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-NEXT: .LBB0_3: ; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -70,7 +69,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq ; @@ -95,7 +94,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB0_3: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -371,26 +370,25 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math" ; SSE2-LABEL: test_fmaximumnum_nsz: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordss %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm3 -; SSE2-NEXT: maxss %xmm1, %xmm0 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_nsz: ; AVX1: # %bb.0: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fmaximumnum_nsz: ; AVX512: # %bb.0: ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -404,9 +402,9 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math" ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 -; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm1 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -421,23 +419,22 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: js .LBB9_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: .LBB9_2: -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB9_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB9_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: jmp .LBB9_3 +; SSE2-NEXT: .LBB9_1: +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: .LBB9_4: -; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: .LBB9_3: +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm0 +; SSE2-NEXT: cmpunordss %xmm3, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_combine_cmps: @@ -454,7 +451,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: .LBB9_3: ; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -469,7 +466,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: retq ; @@ -507,7 +504,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: .LBB9_3: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -527,23 +524,23 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: js .LBB10_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: .LBB10_2: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB10_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB10_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: jmp .LBB10_3 +; SSE2-NEXT: .LBB10_1: +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: .LBB10_4: -; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: .LBB10_3: +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: minss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: cmpunordss %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum: @@ -559,7 +556,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; AVX1-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1-NEXT: .LBB10_3: ; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -573,7 +570,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -599,7 +596,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB10_3: ; X86-NEXT: vminss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -857,26 +854,25 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind { ; SSE2-LABEL: test_fminimumnum_nsz: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordss %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm3 -; SSE2-NEXT: minss %xmm1, %xmm0 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum_nsz: ; AVX1: # %bb.0: ; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimumnum_nsz: ; AVX512: # %bb.0: ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -890,9 +886,9 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 -; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm1 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -907,23 +903,23 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: js .LBB19_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: .LBB19_2: -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB19_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB19_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: jmp .LBB19_3 +; SSE2-NEXT: .LBB19_1: +; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: .LBB19_4: -; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: .LBB19_3: +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: minss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: cmpunordss %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum_combine_cmps: @@ -940,7 +936,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; AVX1-NEXT: vmovaps %xmm2, %xmm0 ; AVX1-NEXT: .LBB19_3: ; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -955,7 +951,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} ; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX512F-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512F-NEXT: vmovaps %xmm1, %xmm0 ; AVX512F-NEXT: retq @@ -994,7 +990,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: .LBB19_3: ; X86-NEXT: vminss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -1022,9 +1018,9 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: orpd %xmm3, %xmm0 ; SSE2-NEXT: retq @@ -1034,7 +1030,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -1048,7 +1044,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) @@ -1084,19 +1080,17 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: cmpordpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm2, %xmm0 -; SSE2-NEXT: andnpd %xmm1, %xmm2 -; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_zero: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_zero: @@ -1108,9 +1102,9 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { ; X86-LABEL: test_fminimumnum_vector_zero: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>) ret <2 x double> %r @@ -1120,20 +1114,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE2-NEXT: maxps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximumnum_vector_signed_zero: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero: @@ -1144,9 +1139,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { ; X86-LABEL: test_fmaximumnum_vector_signed_zero: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>) ret <4 x float> %r @@ -1155,13 +1150,14 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; SSE2-LABEL: test_fminimumnum_vector_partially_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: cmpordpd %xmm0, %xmm1 -; SSE2-NEXT: xorpd %xmm2, %xmm2 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movapd %xmm1, %xmm2 ; SSE2-NEXT: minpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm1, %xmm0 -; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 ; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1169,9 +1165,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero: @@ -1185,9 +1181,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>) ret <2 x double> %r @@ -1212,9 +1208,9 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: minpd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: orpd %xmm3, %xmm0 ; SSE2-NEXT: retq @@ -1226,7 +1222,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -1244,7 +1240,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>) @@ -1278,20 +1274,24 @@ define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) { define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { ; SSE2-LABEL: test_fminimumnum_vector_nan: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_nan: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_nan: @@ -1306,7 +1306,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 -; X86-NEXT: vcmpordpd %xmm1, %xmm1, %xmm2 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>) @@ -1318,19 +1318,17 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: cmpordpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm2, %xmm0 -; SSE2-NEXT: andnpd %xmm1, %xmm2 -; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_zero_first: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_zero_first: @@ -1342,9 +1340,9 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { ; X86-LABEL: test_fminimumnum_vector_zero_first: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x) ret <2 x double> %r @@ -1378,20 +1376,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE2-NEXT: maxps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first: @@ -1402,9 +1401,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { ; X86-LABEL: test_fmaximumnum_vector_signed_zero_first: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x) ret <4 x float> %r @@ -1455,11 +1454,11 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: maxps %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_v4f32_splat: @@ -1468,7 +1467,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -1478,7 +1477,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX512-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq ; @@ -1494,7 +1493,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %splatinsert = insertelement <4 x float> poison, float %y, i64 0 @@ -1506,134 +1505,130 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-LABEL: test_fmaximumnum_v4f16: ; SSE2: # %bb.0: -; SSE2-NEXT: subq $104, %rsp -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: subq $136, %rsp +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_2: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_4: -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: js .LBB33_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_3 +; SSE2-NEXT: .LBB33_1: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_3: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_6 +; SSE2-NEXT: js .LBB33_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_6: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_8: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_6 +; SSE2-NEXT: .LBB33_4: ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_6: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_10: -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: cmpordss %xmm2, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: .LBB33_12: -; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: js .LBB33_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_9 +; SSE2-NEXT: .LBB33_7: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_9: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: andnps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd (%rsp), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_14: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_16: -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: js .LBB33_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_12 +; SSE2-NEXT: .LBB33_10: +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_12: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -1641,7 +1636,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: addq $104, %rsp +; SSE2-NEXT: addq $136, %rsp ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_v4f16: @@ -1679,7 +1674,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1700,7 +1695,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_6: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1721,7 +1716,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_9: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -1742,7 +1737,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_12: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload @@ -1768,7 +1763,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vcmpordss %xmm3, %xmm3, %k1 +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1783,7 +1778,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -1799,7 +1794,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] @@ -1814,7 +1809,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -1831,7 +1826,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] @@ -1846,7 +1841,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -1860,7 +1855,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -1875,7 +1870,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] @@ -1933,7 +1928,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_3: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __extendhfsf2 @@ -1955,7 +1950,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_6: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfhf2 @@ -1993,7 +1988,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_9: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __extendhfsf2 @@ -2015,7 +2010,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_12: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfhf2 @@ -2041,120 +2036,114 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %r15 ; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: subq $56, %rsp -; SSE2-NEXT: pextrw $0, %xmm1, %r14d -; SSE2-NEXT: pextrw $0, %xmm0, %r15d -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrld $16, %xmm2 -; SSE2-NEXT: pextrw $0, %xmm2, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $16, %xmm2 -; SSE2-NEXT: pextrw $0, %xmm2, %ecx +; SSE2-NEXT: psrlq $48, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrlq $48, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] +; SSE2-NEXT: pextrw $0, %xmm4, %ebp +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; SSE2-NEXT: pextrw $0, %xmm4, %r15d +; SSE2-NEXT: pextrw $0, %xmm0, %r12d +; SSE2-NEXT: pextrw $0, %xmm1, %r13d +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pextrw $0, %xmm1, %ecx ; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: testl %ecx, %ecx -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: js .LBB34_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: .LBB34_2: -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: cmpordss %xmm7, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm7, %xmm4 -; SSE2-NEXT: js .LBB34_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_4: -; SSE2-NEXT: pextrw $0, %xmm5, %ebp -; SSE2-NEXT: pextrw $0, %xmm6, %ebx -; SSE2-NEXT: maxss %xmm2, %xmm7 -; SSE2-NEXT: andnps %xmm7, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: js .LBB34_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_3 +; SSE2-NEXT: .LBB34_1: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: .LBB34_3: +; SSE2-NEXT: pextrw $0, %xmm2, %ebx +; SSE2-NEXT: pextrw $0, %xmm3, %r14d +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: shll $16, %r14d -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: testl %r15d, %r15d -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: js .LBB34_6 +; SSE2-NEXT: shll $16, %r13d +; SSE2-NEXT: movd %r13d, %xmm1 +; SSE2-NEXT: shll $16, %r12d +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: js .LBB34_4 ; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: jmp .LBB34_6 +; SSE2-NEXT: .LBB34_4: +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: .LBB34_6: -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm5 -; SSE2-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: cmpordss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm1, %xmm4 -; SSE2-NEXT: js .LBB34_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_8: -; SSE2-NEXT: pextrw $0, %xmm5, %r15d -; SSE2-NEXT: pextrw $0, %xmm6, %r14d -; SSE2-NEXT: maxss %xmm2, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %ebx -; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: movd %r15d, %xmm1 ; SSE2-NEXT: shll $16, %ebp -; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: testl %ebx, %ebx -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: js .LBB34_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_10: +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: js .LBB34_7 +; SSE2-NEXT: # %bb.8: ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: js .LBB34_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB34_12: -; SSE2-NEXT: maxss %xmm3, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_9 +; SSE2-NEXT: .LBB34_7: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_9: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: shll $16, %r14d ; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: testl %r14d, %r14d -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: js .LBB34_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_14: +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %ebx, %xmm2 +; SSE2-NEXT: js .LBB34_10 +; SSE2-NEXT: # %bb.11: ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: js .LBB34_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB34_16: -; SSE2-NEXT: maxss %xmm3, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_12 +; SSE2-NEXT: .LBB34_10: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_12: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2164,6 +2153,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: addq $56, %rsp ; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: popq %r15 ; SSE2-NEXT: popq %rbp @@ -2205,7 +2196,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vpextrw $0, %xmm2, %ebp ; AVX1-NEXT: vpextrw $0, %xmm3, %r15d ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2222,7 +2213,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_6: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2239,7 +2230,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_9: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2256,7 +2247,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_12: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload @@ -2305,7 +2296,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2319,7 +2310,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) @@ -2333,7 +2324,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2347,7 +2338,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2400,7 +2391,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vpextrw $0, %xmm2, %edi ; X86-NEXT: vpextrw $0, %xmm3, %ebp ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: shll $16, %ecx @@ -2416,7 +2407,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_6: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 @@ -2436,7 +2427,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_9: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 @@ -2456,7 +2447,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_12: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 diff --git a/llvm/test/CodeGen/X86/fp-undef.ll b/llvm/test/CodeGen/X86/fp-undef.ll index 227f007..c358085 100644 --- a/llvm/test/CodeGen/X86/fp-undef.ll +++ b/llvm/test/CodeGen/X86/fp-undef.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ANY -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck %s --check-prefix=ANY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ANY ; This is duplicated from tests for InstSimplify. If you're ; adding something here, you should probably add it there too. diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll index 659e4dd..27a651e 100644 --- a/llvm/test/CodeGen/X86/fp128-select.ll +++ b/llvm/test/CodeGen/X86/fp128-select.ll @@ -13,8 +13,8 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: testl %edx, %edx ; SSE-NEXT: jne .LBB0_1 -; SSE-NEXT: # %bb.3: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: # %bb.2: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN] ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_1: @@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind { ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: jmp .LBB1_3 ; SSE-NEXT: .LBB1_1: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0] ; SSE-NEXT: .LBB1_3: # %BB0 ; SSE-NEXT: testl %ebx, %ebx ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/fsxor-alignment.ll b/llvm/test/CodeGen/X86/fsxor-alignment.ll index 6fa4a31..32af5b9 100644 --- a/llvm/test/CodeGen/X86/fsxor-alignment.ll +++ b/llvm/test/CodeGen/X86/fsxor-alignment.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s ; Don't fold the incoming stack arguments into the xorps instructions used ; to do floating-point negations, because the arguments aren't vectors diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll index f710a30..bd997d1 100644 --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse < %s | FileCheck %s ; The debug info in this test case was causing a crash because machine trace metrics ; did not correctly ignore debug instructions. The check lines ensure that the diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll index 8020982..18ded50 100644 --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s -; Test that when we don't -enable-unsafe-fp-math, we don't do the optimization +; Test that when we don't, we don't do the optimization ; -0 - (A - B) to (B - A) because A==B, -0 != 0 define float @negfp(float %a, float %b) nounwind { diff --git a/llvm/test/CodeGen/X86/negate-add-zero.ll b/llvm/test/CodeGen/X86/negate-add-zero.ll index eb4e2d3..4884832 100644 --- a/llvm/test/CodeGen/X86/negate-add-zero.ll +++ b/llvm/test/CodeGen/X86/negate-add-zero.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s | FileCheck %s ; PR3374 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" diff --git a/llvm/test/CodeGen/X86/recip-pic.ll b/llvm/test/CodeGen/X86/recip-pic.ll index d01ecc1..d2620e7 100644 --- a/llvm/test/CodeGen/X86/recip-pic.ll +++ b/llvm/test/CodeGen/X86/recip-pic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK define fastcc float @foo(float %x) unnamed_addr #0 { ; CHECK-LABEL: foo: diff --git a/llvm/test/CodeGen/X86/sincos-opt.ll b/llvm/test/CodeGen/X86/sincos-opt.ll index 6885456..51f3e52 100644 --- a/llvm/test/CodeGen/X86/sincos-opt.ll +++ b/llvm/test/CodeGen/X86/sincos-opt.ll @@ -1,10 +1,10 @@ ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT ; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS -; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH -; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH ; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS -; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH ; RUN: llc < %s -mtriple=x86_64-scei-ps4 -mcpu=btver2 | FileCheck %s --check-prefix=PS4_SINCOS ; RUN: llc < %s -mtriple=x86_64-sie-ps5 -mcpu=znver2 | FileCheck %s --check-prefix=PS4_SINCOS diff --git a/llvm/test/CodeGen/X86/sincos.ll b/llvm/test/CodeGen/X86/sincos.ll index 7903407..9206c25 100644 --- a/llvm/test/CodeGen/X86/sincos.ll +++ b/llvm/test/CodeGen/X86/sincos.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Make sure this testcase codegens to the sin and cos instructions, not calls -; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s declare float @sinf(float) readonly diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll index c0beb6f..2822d40 100644 --- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s --check-prefix=CST --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL ; Check that the constant used in the vectors are the right ones. ; SSE2: [[MASKCSTADDR:.LCPI[0-9_]+]]: diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll new file mode 100644 index 0000000..f6e941a --- /dev/null +++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +;; Aggressive instcombine merges the two i8 stores into an i16 store. Check +;; the debug location and DIAssignID metadata get merged. + +; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] { +; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]] +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]]) +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]]) +; CHECK-NEXT: ret void + +; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]]) + +define void @test_i16(i16 %x, ptr %p) !dbg !5 { + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17 + #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18) + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20 + #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 7} +!3 = !{i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned) +!16 = !DILocation(line: 2, column: 1, scope: !5) +!17 = distinct !DIAssignID() +!18 = !DILocation(line: 1, column: 1, scope: !5) +!19 = !DILocation(line: 6, column: 1, scope: !5) +!20 = distinct !DIAssignID() diff --git a/llvm/test/Instrumentation/AllocToken/basic.ll b/llvm/test/Instrumentation/AllocToken/basic.ll index 099d37d..0c34b137 100644 --- a/llvm/test/Instrumentation/AllocToken/basic.ll +++ b/llvm/test/Instrumentation/AllocToken/basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/basic32.ll b/llvm/test/Instrumentation/AllocToken/basic32.ll index 944a452..52d1d14 100644 --- a/llvm/test/Instrumentation/AllocToken/basic32.ll +++ b/llvm/test/Instrumentation/AllocToken/basic32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" diff --git a/llvm/test/Instrumentation/AllocToken/fast.ll b/llvm/test/Instrumentation/AllocToken/fast.ll index 19a3ef6..f6bf5ee 100644 --- a/llvm/test/Instrumentation/AllocToken/fast.ll +++ b/llvm/test/Instrumentation/AllocToken/fast.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic.ll b/llvm/test/Instrumentation/AllocToken/intrinsic.ll index 13aaa90..5c6f2f1 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. ; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll index eb5dbbe..15f7c25 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. ; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386-pc-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/invoke.ll b/llvm/test/Instrumentation/AllocToken/invoke.ll index 347c99a..8e7ab38 100644 --- a/llvm/test/Instrumentation/AllocToken/invoke.ll +++ b/llvm/test/Instrumentation/AllocToken/invoke.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll index 19673da..45f573e 100644 --- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll +++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-extended -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-extended -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll index 1f77648..4d1be5e 100644 --- a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll +++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll index 1ddcd4b..1c869bd 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s +; RUN: opt -S -passes=msan -mattr=+sme -o - %s ; XFAIL: * diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll index 9caa89d..00cf3204 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s +; RUN: opt -S -passes=msan -mattr=+sme -o - %s ; XFAIL: * diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll new file mode 100644 index 0000000..3f43efa --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll +; Manually reduced to show MSan leads to a compiler crash + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory { + %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %2 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %1, ptr %ptr) + ret void +} diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll new file mode 100644 index 0000000..cd04373 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll @@ -0,0 +1,340 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zm) + ret void +} + +define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zm) + ret void +} + + +define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm) + ret void +} + +define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm) + ret void +} + + +define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) + ret void +} + + +define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) + ret void +} + + + +define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1, + <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1, + <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, + <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1, + <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) + ret void +} + +define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1, + <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1, + <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, + <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1, + <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) + ret void +} + +define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice,<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice, + <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7, + <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice, + <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7, + <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) sanitize_memory { +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 0 + %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0 + %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %5) + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %6) + ret void +} + + +define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7, + <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, + <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7, + <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, + <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice, + <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, + <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7, + <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, + <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory { +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 0 + %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 1 + %4 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 2 + %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 0 + %8 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 1 + %9 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 2 + %10 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 0 + %13 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 1 + %14 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 2 + %15 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 0 + %18 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 1 + %19 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 2 + %20 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 3 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %2, <vscale x 4 x float> %7, <vscale x 4 x float> %12, <vscale x 4 x float> %17) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %3, <vscale x 4 x float> %8, <vscale x 4 x float> %13, <vscale x 4 x float> %18) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %4, <vscale x 4 x float> %9, <vscale x 4 x float> %14, <vscale x 4 x float> %19) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %5, <vscale x 4 x float> %10, <vscale x 4 x float> %15, <vscale x 4 x float> %20) + ret void +} + +define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice, + <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, + <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7, + <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, + <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) + ret void +} + + +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) sanitize_memory { + %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } + @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, + <vscale x 16 x i8> %zm) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) sanitize_memory { + %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } + @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, + <vscale x 8 x i16> %zm) + ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res +} + +define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) sanitize_memory { + %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } + @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, + <vscale x 4 x i32> %zm) + ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res +} + +define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) sanitize_memory { + %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } + @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, + <vscale x 2 x i64> %zm) + ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res +} + + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8>%zm) sanitize_memory { + %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } + @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, + <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, + <vscale x 16 x i8> %zm) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) sanitize_memory { + %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } + @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, + <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, + <vscale x 8 x i16> %zm) + ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res +} + +define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) sanitize_memory { + %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } + @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, + <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, + <vscale x 4 x i32> %zm) + ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res +} + +define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) sanitize_memory { + %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } + @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, + <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, + <vscale x 2 x i64> %zm) + ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res +} +declare void@llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>) +declare void@llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>) +declare void@llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>) +declare void@llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>) +declare void@llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>) +declare void@llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>) +declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) +declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) +declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll index 434ac84..3d759f7 100644 --- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll @@ -865,33 +865,6 @@ entry: ret float %r } -; Note that the `unsafe-fp-math` from the function attributes should be moved to -; individual instructions, with the shadow instructions NOT getting the attribute. -define float @param_add_return_float_unsafe_fp_math(float %a) #0 { -; CHECK-LABEL: @param_add_return_float_unsafe_fp_math( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64) -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr @__nsan_shadow_args_ptr, align 1 -; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] -; CHECK-NEXT: store i64 0, ptr @__nsan_shadow_args_tag, align 8 -; CHECK-NEXT: [[B:%.*]] = fadd fast float [[A]], 1.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP5]], i32 1, i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[B]] to double -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: store i64 ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64), ptr @__nsan_shadow_ret_tag, align 8 -; CHECK-NEXT: store double [[TMP9]], ptr @__nsan_shadow_ret_ptr, align 8 -; CHECK-NEXT: ret float [[B]] -; -entry: - %b = fadd float %a, 1.0 - ret float %b -} - - define void @truncate(<2 x double> %0) sanitize_numerical_stability { ; DQQ-LABEL: @truncate( ; DQQ-NEXT: entry: @@ -941,4 +914,4 @@ entry: } -attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } diff --git a/llvm/test/LTO/AArch64/Inputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll new file mode 100644 index 0000000..7c2a753 --- /dev/null +++ b/llvm/test/LTO/AArch64/Inputs/bar.ll @@ -0,0 +1,35 @@ +;; This file contains the new semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local void @bar() #0 { +entry: + ret void +} +; CHECK-LABEL: bar: +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + +define dso_local void @baz() #1 { +entry: + ret void +} + +; CHECK-LABEL: baz: +; CHECK: bti c +; CHECK: ret + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 8, !"branch-target-enforcement", i32 2} +!1 = !{i32 8, !"sign-return-address", i32 2} +!2 = !{i32 8, !"sign-return-address-all", i32 2} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} diff --git a/llvm/test/LTO/AArch64/Inputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll new file mode 100644 index 0000000..e578426 --- /dev/null +++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll @@ -0,0 +1,41 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare void @func() + +define i32 @fiz_on() #0 { +entry: + call void @func() + ret i32 42 +} + +; CHECK-LABEL: fiz_on: +; CHECK: paciasp +; CHECK: bl func +; CHECK: retaa + +define i32 @fiz_off() #1 { +entry: + ret i32 43 +} + +; CHECK-LABEL: fiz_off: +; CHECK-NOT: pac +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} diff --git a/llvm/test/LTO/AArch64/Inputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll index 961b0d4..689d938 100644 --- a/llvm/test/LTO/AArch64/Inputs/foo.ll +++ b/llvm/test/LTO/AArch64/Inputs/foo.ll @@ -1,12 +1,34 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" -define dso_local i32 @foo() #0 { +define i32 @foo_on() #0 { entry: ret i32 42 } +; CHECK-LABEL: foo_on: +; CHECK: pacibsp +; CHECK: mov +; CHECK: retab + +define i32 @foo_off() #1 { +entry: + ret i32 43 +} + +; CHECK-LABEL: foo_off: +; CHECK-NOT: pac +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" } !llvm.module.flags = !{!0, !1, !2, !3} diff --git a/llvm/test/LTO/AArch64/Inputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll new file mode 100644 index 0000000..2b1758b --- /dev/null +++ b/llvm/test/LTO/AArch64/Inputs/old.ll @@ -0,0 +1,59 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define i32 @old_bti() #0 { +entry: + ret i32 2 +} + +; CHECK-LABEL: old_bti: +; CHECK: bti c +; CHECK: mov +; CHECK: ret + +define i32 @old_pac() #1 { +entry: + ret i32 2 +} + +; CHECK-LABEL: old_pac: +; CHECK: paciasp +; CHECK: mov +; CHECK: retaa + + +define i32 @old_none() #2 { +entry: + ret i32 3 +} + +; CHECK-LABEL: old_none: +; CHECK-NOT: hint +; CHECK-NOT: paci +; CHECK-NOT: bti +; CHECK: ret + +declare i32 @func(i32) + +define i32 @old_none_leaf() #3 { +entry: + %0 = call i32 @func() + ret i32 %0 +} + +; CHECK-LABEL: old_none_leaf: +; CHECK: paciasp +; CHECK: bl func +; CHECK: retaa + +attributes #0 = { noinline nounwind optnone "branch-target-enforcement"="true" } +attributes #1 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="all" "sign-return-address-key"="a_key" } +attributes #2 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="none" } +attributes #3 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" } + +;; Intentionally no module flags diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index b3c9828..20254de 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -1,10 +1,10 @@ -; Testcase to check that module with different branch-target-enforcement can -; be mixed. -; +;; Testcase to check that module with different branch-target-enforcement can +;; be mixed. +;; ; RUN: llvm-as %s -o %t1.bc ; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc ; RUN: llvm-lto -exported-symbol main \ -; RUN: -exported-symbol foo \ +; RUN: -exported-symbol foo_on \ ; RUN: -filetype=obj \ ; RUN: %t1.bc %t2.bc \ ; RUN: -o %t1.exe 2>&1 | FileCheck --allow-empty %s @@ -14,11 +14,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" -declare i32 @foo(); +declare i32 @foo_on(); define i32 @main() "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" { entry: - %add = call i32 @foo() + %add = call i32 @foo_on() ret i32 %add } @@ -30,9 +30,12 @@ entry: ; CHECK-NOT: linking module flags 'branch-target-enforcement': IDs have conflicting values in ; CHECK-DUMP: <main>: +; CHECK-DUMP: paciasp +; CHECK-DUMP: str ; CHECK-DUMP: bl 0x8 <main+0x8> -; CHECK-DUMP: <foo>: +; CHECK-DUMP: <foo_on>: +; CHECK-DUMP: pacibsp -; `main` doesn't support BTI while `foo` does, so in the binary -; we should see only PAC which is supported by both. +;; `main` doesn't support BTI while `foo` does, so in the binary +;; we should see only PAC which is supported by both. ; CHECK-PROP: Properties: aarch64 feature: PAC
\ No newline at end of file diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll new file mode 100644 index 0000000..331e481 --- /dev/null +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -0,0 +1,127 @@ +;; Testcase to check that module with different sign return address can +;; be mixed. +; +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc +; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc +; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc +; RUN: llvm-lto -exported-symbol main \ +; RUN: -exported-symbol foo_on \ +; RUN: -exported-symbol foo_off \ +; RUN: -exported-symbol fiz_on \ +; RUN: -exported-symbol fiz_off \ +; RUN: -exported-symbol bar \ +; RUN: -exported-symbol baz \ +; RUN: -exported-symbol old_bti \ +; RUN: -exported-symbol old_pac \ +; RUN: -exported-symbol old_none \ +; RUN: -filetype=obj \ +; RUN: %t5.bc %t4.bc %t3.bc %t2.bc %t1.bc \ +; RUN: -o %t1.exe 2>&1 +; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s +; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo_on(); +declare i32 @foo_off(); +declare i32 @fiz_on(); +declare i32 @fiz_off(); +declare void @baz(); +declare void @bar(); +declare i32 @old_bti(); +declare i32 @old_pac(); +declare i32 @old_none(); + +define i32 @main() #0 { +entry: + call i32 @foo_on() + call i32 @foo_off() + call i32 @fiz_on() + call i32 @fiz_off() + call void @bar() + call void @baz() + call i32 @old_bti() + call i32 @old_pac() + call i32 @old_none() + ret i32 0 +} + +attributes #0 = { noinline nounwind optnone } + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 0} +!1 = !{i32 8, !"sign-return-address", i32 0} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} + + +; CHECK-DUMP-LABEL: <old_bti>: +; CHECK-DUMP-NEXT: bti c +; CHECK-DUMP-NEXT: mov w0, #0x2 +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: <old_pac>: +; CHECK-DUMP-NEXT: paciasp +; CHECK-DUMP-NEXT: mov w0, #0x2 +; CHECK-DUMP-NEXT: autiasp +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: <old_none>: +; CHECK-DUMP-NEXT: mov w0, #0x3 +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: <bar>: +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: <baz>: +; CHECK-DUMP-NEXT: bti c +; CHECK-DUMP-NEXT: ret + +;; fiz.ll represents a module with the old style of the function attributes. +;; fiz_on shall have PAC with A-key as it requested at module level. +; CHECK-DUMP-LABEL: <fiz_on>: +; CHECK-DUMP-NEXT: paciasp +; CHECK-DUMP-NEXT: str x30, [sp, #-0x10]! +; CHECK-DUMP-NEXT: bl 0x38 <fiz_on+0x8> +; CHECK-DUMP-NEXT: mov w0, #0x2a +; CHECK-DUMP-NEXT: ldr x30, [sp], #0x10 +; CHECK-DUMP-NEXT: autiasp +; CHECK-DUMP-NEXT: ret + +;; fiz_off shall not have BTI or PAC instructions as they are disabled at function scope. +; CHECK-DUMP-LABEL: <fiz_off>: +; CHECK-DUMP-NEXT: mov w0, #0x2b +; CHECK-DUMP-NEXT: ret + +;; foo.ll represents a module with the old style of the function attributes. +;; foo_on shall have PAC with B-key as it requested at module level. +; CHECK-DUMP-LABEL: <foo_on>: +; CHECK-DUMP-NEXT: pacibsp +; CHECK-DUMP-NEXT: mov w0, #0x2a +; CHECK-DUMP-NEXT: autibsp +; CHECK-DUMP-NEXT: ret + +;; foo_off shall not have BTI or PAC instructions as they are disabled at function scope. +; CHECK-DUMP-LABEL: <foo_off>: +; CHECK-DUMP-NEXT: mov w0, #0x2b +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: <main>: +; CHECK-DUMP-NOT: paciasp +; CHECK-DUMP-NEXT: str x30, +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl + +;; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary +;; we should not see anything. +; CHECK-PROP-NOT: Properties: aarch64 feature: PAC diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index a90f212..b5984bf 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,11 +13,11 @@ entry: ret i32 %add } -; CHECK: define i32 @main() { +; CHECK: define i32 @main() ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } +; CHECK: attributes [[ARM_ATTRS]] = {{{.*}}"target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = {{{.*}}"target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 78aa8f2..3faea99 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -20,282 +20,282 @@ //---------------------------------------------------------------------------// v_fract_f64 v[0:1], 0.5 -// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] v_sqrt_f64 v[0:1], -4.0 -// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] -// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] -// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] // GFX11: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] +// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] v_log_clamp_f32 v1, 0.5 // NOGFX8PLUS: :[[@LINE-1]]:1: error: instruction not supported on this GPU // SICI: v_log_clamp_f32_e32 v1, 0.5 ; encoding: [0xf0,0x4c,0x02,0x7e] v_trunc_f32 v0, 0.5 -// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] v_fract_f64 v[0:1], -1.0 -// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] v_trunc_f32 v0, -1.0 -// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] v_fract_f64 v[0:1], 4.0 -// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] v_trunc_f32 v0, 4.0 -// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] v_fract_f64 v[0:1], 0.0 -// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] v_trunc_f32 v0, 0.0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64 v[0:1], 1.5 -// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] // GFX11: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] v_trunc_f32 v0, 1.5 -// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] // GFX11: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] v_fract_f64 v[0:1], -3.1415 -// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] -// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX12: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX1250: v_fract_f64_e32 v[0:1], 0xc00921cac083126f ; encoding: [0xfe,0x7c,0x00,0x7e,0x6f,0x12,0x83,0xc0,0xca,0x21,0x09,0xc0] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, -3.1415 -// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] // GFX11: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] v_fract_f64 v[0:1], 100000000000000000000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX12: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX1250: v_fract_f64_e32 v[0:1], 0x44b52d02c7e14af6 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf6,0x4a,0xe1,0xc7,0x02,0x2d,0xb5,0x44] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 100000000000000000000000.0 -// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] // GFX11: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] +// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] v_fract_f64 v[0:1], 10000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] // GFX11: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] +// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] v_trunc_f32 v0, 10000000.0 -// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] // GFX11: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] +// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] v_fract_f64 v[0:1], 3.402823e+38 -// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] -// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX12: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX1250: v_fract_f64_e32 v[0:1], 0x47efffff966ad924 ; encoding: [0xfe,0x7c,0x00,0x7e,0x24,0xd9,0x6a,0x96,0xff,0xff,0xef,0x47] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 3.402823e+38 -// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] // GFX11: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] v_fract_f64 v[0:1], 2.3509886e-38 -// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] -// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX12: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX1250: v_fract_f64_e32 v[0:1], 0x381fffffe8c9d9fb ; encoding: [0xfe,0x7c,0x00,0x7e,0xfb,0xd9,0xc9,0xe8,0xff,0xff,0x1f,0x38] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-38 -// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] // GFX11: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] +// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] v_fract_f64 v[0:1], 2.3509886e-70 -// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] -// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX12: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX1250: v_fract_f64_e32 v[0:1], 0x3179f623c2d3cf3c ; encoding: [0xfe,0x7c,0x00,0x7e,0x3c,0xcf,0xd3,0xc2,0x23,0xf6,0x79,0x31] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-70 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 1.0 -// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1.0) -// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] +// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1.0 ; encoding: [0xf2,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x3f80) ; encoding: [0xff,0x94,0x0a,0x7e,0x80,0x3f,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1.0 -// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1.0) -// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] // GFX11: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] +// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x3f80) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1.0, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1.0), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 0x3c00 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x3c00) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // fp literal, expected int operand @@ -309,118 +309,118 @@ s_mov_b64 s[0:1], lit(0.5) // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0.5, v1 -// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] v_and_b32_e64 v0, 0.5, v1 -// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -1.0 // GFX8PLUS: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x04,0x80,0xbe] v_and_b32_e32 v0, -1.0, v1 -// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] v_and_b32_e64 v0, -1.0, v1 -// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 4.0 // GFX8PLUS: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x04,0x80,0xbe] v_and_b32_e32 v0, 4.0, v1 -// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] v_and_b32_e64 v0, 4.0, v1 -// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 0.0 // GFX8PLUS: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0.0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0.0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1.5 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 1.5, v1 -// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] -// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] // GFX11: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] +// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] s_mov_b64_e32 s[0:1], -3.1415 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, -3.1415, v1 -// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] -// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] // GFX11: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] +// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] s_mov_b64_e32 s[0:1], 100000000000000000000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 100000000000000000000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] -// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] -// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] // GFX11: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] +// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] s_mov_b64_e32 s[0:1], 10000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 10000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] -// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] -// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] // GFX11: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] +// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] s_mov_b64_e32 s[0:1], 3.402823e+38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 3.402823e+38, v1 -// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] -// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] // GFX11: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] +// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] s_mov_b64_e32 s[0:1], 2.3509886e-38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 2.3509886e-38, v1 -// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] -// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] -// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] // GFX11: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] +// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] s_mov_b64_e32 s[0:1], 2.3509886e-70 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction @@ -429,322 +429,322 @@ v_and_b32_e32 v0, 2.3509886e-70, v1 // NOGCN: :[[@LINE-1]]:19: error: invalid operand for instruction v_not_b16 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] // GFX1250: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_and_b32_e32 v0, 1.0, v1 -// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] v_and_b32_e32 v0, lit(1.0), v1 -// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] -// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] -// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] // GFX11: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] +// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] v_pk_add_u16 v5, exec_lo, 1.0 +// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] // GFX12XX: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0xe4,0x01,0x18] -// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_pk_add_u16 v5, exec_lo, lit(1.0) -// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] +// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xca,0x03] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x3f800000) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x00,0x00,0x80,0x3f] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected fp operand //---------------------------------------------------------------------------// v_trunc_f32_e32 v0, 0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 1 -// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1) -// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX12: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xfe,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] v_trunc_f32_e64 v0, 0 -// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 0 -// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] v_trunc_f32_e32 v0, -13 -// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], -13 -// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, -13 -// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], -13 -// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] v_trunc_f32_e32 v0, 35 -// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 35 -// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, 35 -// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 35 -// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] v_trunc_f32_e32 v0, 1234 -// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] v_fract_f64_e32 v[0:1], 1234 -// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] v_trunc_f32_e64 v0, 1234 +// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:21: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:21: error: literal operands are not supported -// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:21: error: literal operands are not supported v_fract_f64_e64 v[0:1], 1234 +// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported -// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported v_trunc_f32_e32 v0, -54321 -// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_fract_f64_e32 v[0:1], -54321 -// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_trunc_f32_e32 v0, 0xdeadbeef -// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] v_fract_f64_e32 v[0:1], 0xdeadbeef -// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] v_trunc_f32_e32 v0, 0xffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffff -// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] v_trunc_f32_e32 v0, 0x123456789abcdef0 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:25: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:25: error: invalid operand for instruction // GFX1250: v_fract_f64_e32 v[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:25: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:25: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:25: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:25: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:25: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:25: error: invalid operand for instruction v_trunc_f32_e32 v0, 0xffffffffffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffffffffffff -// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1 ; encoding: [0x81,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x1) ; encoding: [0xff,0x94,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1 -// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1) -// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] v_dot2_bf16_bf16 v5.l, v1, v2, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x06,0x02] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x1) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x1) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected int operand @@ -755,111 +755,111 @@ s_mov_b64_e32 s[0:1], 0 // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -13 // GFX8PLUS: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x04,0x80,0xbe] v_and_b32_e32 v0, -13, v1 -// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] v_and_b32_e64 v0, -13, v1 -// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 35 // GFX8PLUS: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x04,0x80,0xbe] v_and_b32_e32 v0, 35, v1 -// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] v_and_b32_e64 v0, 35, v1 -// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1234 // GFX8PLUS: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x01,0x80,0xbe,0xd2,0x04,0x00,0x00] // SICI: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x04,0x80,0xbe,0xd2,0x04,0x00,0x00] v_and_b32_e32 v0, 1234, v1 -// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] -// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] -// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] // GFX11: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] +// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] v_and_b32_e64 v0, 1234, v1 +// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:19: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:19: error: literal operands are not supported -// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:19: error: literal operands are not supported s_mov_b64_e32 s[0:1], -54321 -// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffffffff2bcf ; encoding: [0xfe,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff,0xff,0xff,0xff,0xff] +// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] v_and_b32_e32 v0, -54321, v1 -// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] -// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] // GFX11: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] +// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] s_mov_b64_e32 s[0:1], 0xdeadbeef -// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] -// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX11: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX12: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX1250: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xfe,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] +// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] v_and_b32_e32 v0, 0xdeadbeef, v1 -// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] -// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] -// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] // GFX11: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] +// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] s_mov_b64_e32 s[0:1], 0xffffffff -// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] v_and_b32_e32 v0, 0xffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] s_mov_b64_e32 s[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:23: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:23: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x01,0x80,0xbe,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:23: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:23: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:23: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:23: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:23: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:23: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0x123456789abcdef0, v1 @@ -870,75 +870,75 @@ s_mov_b64_e32 s[0:1], 0xffffffffffffffff // SICI: s_mov_b64 s[0:1], -1 ; encoding: [0xc1,0x04,0x80,0xbe] v_and_b32_e32 v0, 0xffffffffffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] v_not_b16 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU s_mov_b64 s[0:1], 1 // GFX8PLUS: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x04,0x80,0xbe] s_mov_b64 s[0:1], lit(1) -// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX11: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX12: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX1250: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] +// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00] v_and_b32_e32 v0, 1, v1 -// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] v_and_b32_e32 v0, lit(1), v1 -// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] -// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00] -// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] // GFX11: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] +// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] +// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00] +// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] v_pk_add_u16 v5, exec_lo, 1 +// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] // GFX12XX: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0x02,0x01,0x18] -// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_pk_add_u16 v5, exec_lo, lit(1) -// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x06,0x02] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x1) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // 1/(2*PI) @@ -948,46 +948,46 @@ v_trunc_f32_e32 v0, 0x3fc45f306dc9c882 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 0x3fc45f306dc9c882 -// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] -// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction // GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] +// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction v_trunc_f32_e32 v0, 0x3e22f983 -// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_fract_f64_e32 v[0:1], 0x3e22f983 -// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] // GFX11: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_trunc_f32_e64 v0, 0x3fc45f306dc9c882 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e64 v[0:1], 0x3fc45f306dc9c882 -// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction // GFX11: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction v_trunc_f32_e64 v0, 0x3e22f983 -// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:21: error: literal operands are not supported // GFX11: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported // NOSICIVI: :[[@LINE-2]]:21: error: literal operands are not supported v_fract_f64_e64 v[0:1], 0x3e22f983 +// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] // GFX12XX: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] -// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported -// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] +// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335 @@ -996,37 +996,37 @@ s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335 // NOSICIVI: :[[@LINE-2]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0.159154943091895317852646485335, v1 -// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e] -// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e] v_and_b32_e64 v0, 0.159154943091895317852646485335, v1 -// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] -// NOSICI: :[[@LINE-3]]:19: error: literal operands are not supported // GFX11: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00] +// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported // NOSICIVI: :[[@LINE-2]]:19: error: literal operands are not supported v_fract_f64 v[0:1], 0.159154943091895317852646485335 -// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] +// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] // GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] // NOSICI: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 0.159154943091895317852646485335 -// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_trunc_f32 v0, lit(0.159154943091895317852646485335) -// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] // GFX11: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] //---------------------------------------------------------------------------// // integer literal truncation checks @@ -1051,54 +1051,54 @@ v_trunc_f32 v0, 0x1fffffff000 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction s_mov_b64 s[0:1], 0x101ffffffff -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x101ffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x01,0x01,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction s_mov_b64 s[0:1], 0x1000000001 -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x1000000001 ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x10,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction s_mov_b64 s[0:1], 0x1000000fff -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x1000000fff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0x0f,0x00,0x00,0x10,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x1fffffffff0 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffffff0 ; encoding: [0xfe,0x2e,0x00,0x7e,0xf0,0xff,0xff,0xff,0xff,0x01,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x100000001 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x100000001 ; encoding: [0xfe,0x2e,0x00,0x7e,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x1fffffff000 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffff000 ; encoding: [0xfe,0x2e,0x00,0x7e,0x00,0xf0,0xff,0xff,0xff,0x01,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction //---------------------------------------------------------------------------// @@ -1106,210 +1106,210 @@ v_trunc_f64 v[0:1], 0x1fffffff000 //---------------------------------------------------------------------------// buffer_atomic_add v0, off, s[0:3], scc offset:4095 -// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd] -// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd] -// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00] // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xfd] +// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00] +// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd] +// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd] s_add_i32 s0, vccz, s0 -// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] // GFX89: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] -// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU +// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] s_add_i32 s0, execz, s0 -// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] // GFX89: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] -// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] s_add_i32 s0, scc, s0 -// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] -// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] -// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] // GFX11: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] s_and_b64 s[0:1], s[0:1], src_vccz -// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87] // GFX89: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x86] -// NOGFX11: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:27: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU +// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87] s_and_b64 s[0:1], s[0:1], src_execz -// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87] // GFX89: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x86] -// NOGFX11: :[[@LINE-3]]:27: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:27: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:27: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_execz register not available on this GPU +// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87] s_and_b64 s[0:1], s[0:1], src_scc -// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87] -// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86] -// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] // GFX11: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] +// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] +// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86] +// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87] v_add_u16 v0, vccz, v0 // GFX89: v_add_u16_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x4c] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, src_scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xfd,0x06,0x86,0x06] -// NOVI: :[[@LINE-3]]:20: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:20: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, v0, scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, v0, src_scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xfa,0x01,0x4c,0x00,0x06,0x06,0x86] -// NOVI: :[[@LINE-3]]:24: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:24: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, execz, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x68] -// NOVI: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode -// NOGFX11: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32_e64 v0, scc, v0 +// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] // GFX12XX: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xfd,0x00,0x02,0x00] -// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_i64 vcc, scc, v[0:1] -// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d] // GFX89: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0xc4,0x7d] -// NOGFX11: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d] v_max_f16 v0, execz, v0 // GFX89: v_max_f16_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x5a] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_max_f32 v0, vccz, v0 -// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20] // GFX89: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x16] -// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU +// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20] v_max_f64 v[0:1], scc, v[0:1] -// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00] -// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00] -// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c] // GFX11: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xfd,0x00,0x02,0x00] +// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c] +// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00] +// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00] v_pk_add_f16 v0, execz, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_f16 v0, src_execz, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xfc,0x00,0x02,0x18] -// NOVI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-4]]:18: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:18: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:18: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:18: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:18: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:18: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, neg(vccz) // GFX89: v_ceil_f16_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x85,0xd1,0xfb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, abs(scc) -// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00] -// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:1: error: instruction not supported on this GPU // GFX11: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] +// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] +// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], |execz| -// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00] // CI: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x30,0xd3,0xfc,0x00,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-4]]:21: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:21: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:21: error: src_execz register not available on this GPU +// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00] +// NOGFX11: :[[@LINE-3]]:21: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-4]]:21: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:21: error: src_execz register not available on this GPU +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], -vcc -// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20] // CI: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x20] // GFX11: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20] // GFX12: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20] -// NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:12: error: invalid operand for instruction +// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20] +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU v_ceil_f32 v0, -vccz -// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20] // GFX89: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x5d,0xd1,0xfb,0x00,0x00,0x20] -// NOGFX11: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:17: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:17: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU +// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20] v_ceil_f32 v0, |execz| -// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00] // GFX89: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x5d,0xd1,0xfc,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-3]]:17: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:17: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:17: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:17: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:17: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:17: error: src_execz register not available on this GPU +// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00] v_ceil_f16_sdwa v5, |vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, |src_vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16_sdwa v5, -scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, -src_scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfd,0x16,0x96,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f32_sdwa v5, vccz dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, src_vccz dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfb,0x16,0x86,0x00] -// NOVI: :[[@LINE-3]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:21: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, |src_execz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfc,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported //---------------------------------------------------------------------------// @@ -1317,266 +1317,266 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD //---------------------------------------------------------------------------// buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 -// NOSICI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU -// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb] // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xeb] -// NOVI: :[[@LINE-4]]:36: error: src_shared_base register not available on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode +// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb] +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:36: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:36: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU s_add_i32 s0, src_shared_base, s0 +// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU // GFX9: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU s_add_i32 s0, src_shared_limit, s0 +// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_shared_limit register not available on this GPU // GFX9: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_limit register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_limit register not available on this GPU s_add_i32 s0, src_private_base, s0 +// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_private_base register not available on this GPU // GFX9: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_private_base register not available on this GPU s_add_i32 s0, src_private_limit, s0 +// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_private_limit register not available on this GPU // GFX9: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_private_limit register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_private_limit register not available on this GPU s_add_i32 s0, src_pops_exiting_wave_id, s0 -// NOSICI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU // GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81] -// NOVI: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX11: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX12: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOSICI: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOVI: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU s_and_b64 s[0:1], s[0:1], src_shared_base +// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_shared_base register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_base register not available on this GPU s_and_b64 s[0:1], s[0:1], src_shared_limit +// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_shared_limit register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_shared_limit register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_limit register not available on this GPU s_and_b64 s[0:1], s[0:1], src_private_base +// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_private_base register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_private_base register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_private_base register not available on this GPU s_and_b64 s[0:1], s[0:1], src_private_limit +// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_private_limit register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_private_limit register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_private_limit register not available on this GPU s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id -// NOSICI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86] -// NOVI: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX11: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX12: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOSICI: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOVI: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU v_add_u16 v0, src_shared_base, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4c] -// NOVI: :[[@LINE-3]]:15: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06] -// NOVI: :[[@LINE-3]]:20: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86] -// NOVI: :[[@LINE-3]]:24: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:24: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, src_shared_base, v0 +// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] // GFX12XX: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x68] -// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32_e64 v0, src_shared_base, v0 +// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] // GFX12XX: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00] -// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_i64 vcc, src_shared_base, v[0:1] -// NOSICI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU // GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d] -// NOVI: :[[@LINE-3]]:19: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode +// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU v_max_f16 v0, src_shared_base, v0 +// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72] // GFX12XX: v_max_num_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x62] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x5a] -// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_max_f32 v0, src_shared_base, v0 +// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20] // GFX12XX: v_max_num_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x2c] -// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU // GFX9: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x16] -// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU v_max_f64 v[0:1], src_shared_base, v[0:1] +// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00] // GFX12XX: v_max_num_f64_e32 v[0:1], src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0x00,0x1c] -// NOSICI: :[[@LINE-2]]:19: error: src_shared_base register not available on this GPU // GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00] -// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:19: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU v_pk_add_f16 v0, src_shared_base, v0 +// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] // GFX12XX: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xeb,0x00,0x02,0x18] -// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, neg(src_shared_base) +// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] // GFX12XX: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20] -// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, abs(src_shared_base) +// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] // GFX12XX: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00] -// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], |src_shared_base| -// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00] // GFX11: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00] // GFX12: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00] -// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU -// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction +// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00] +// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU v_ceil_f64 v[5:6], -src_shared_base -// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20] // GFX11: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20] // GFX12: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20] -// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU -// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction +// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20] +// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU v_ceil_f32 v0, -src_shared_base +// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] // GFX12XX: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU // GFX9: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20] -// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_ceil_f32 v0, |src_shared_base| +// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] // GFX12XX: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU // GFX9: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00] -// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00] -// NOVI: :[[@LINE-3]]:21: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported //---------------------------------------------------------------------------// @@ -1584,206 +1584,206 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD //---------------------------------------------------------------------------// v_add_u32 v0, private_base, s0 -// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:29: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, scc, s0 -// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, shared_base, v0, v1 -// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOSICI: :[[@LINE-2]]:20: error: src_shared_base register not available on this GPU // GFX11: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOVI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] +// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:20: error: src_shared_base register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, shared_limit, v1 -// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: src_shared_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOVI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:24: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] +// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:24: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:24: error: src_shared_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, private_limit -// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOSICI: :[[@LINE-2]]:28: error: src_private_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOVI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] +// NOGFX9: :[[@LINE-3]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:28: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:28: error: src_private_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, execz, v0, v1 -// NOSICI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:20: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:20: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:20: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:20: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, scc, v1 +// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] // GFX12XX: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: invalid operand (violates constant bus restrictions) // NOGFX89: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] +// NOSICI: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:24: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, vccz -// NOSICI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:28: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:28: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:28: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:28: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) // v_addc_co_u32 implicitly reads VCC (VOP2) v_addc_co_u32 v0, vcc, shared_base, v0, vcc -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madak_f32 v0, shared_base, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_madak_f32 v0, scc, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) v_madak_f32 v0, 0xff32ff, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madak_f32 v0, 0xff32ff, v0, 1 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, 0x11213141, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, -1, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madak_f16 v0, 0xff32, v0, 0x1122 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madak_f16 v0, 0xff32, v0, 0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 0x1122, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 1, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_f32 s[0:1], private_base, private_limit -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], private_base, s0 -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], execz, s0 -// NOSICI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:22: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:22: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:22: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:22: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:22: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:22: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) v_pk_add_f16 v255, private_base, private_limit -// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:34: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] +// NOGFX9: :[[@LINE-3]]:34: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_pk_add_f16 v255, vccz, execz -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:26: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:20: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU +// NOGFX9: :[[@LINE-4]]:26: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// @@ -1791,36 +1791,36 @@ v_pk_add_f16 v255, vccz, execz //---------------------------------------------------------------------------// v_sqrt_f32 v2, lit(123) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, abs(lit(123)) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit(123.0) -// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] // GFX11: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] +// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] v_sqrt_f64 v[2:3], lit(123.0) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xfe,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] v_sqrt_f64 v[2:3], lit(123) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xfe,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit 123.0 // NOGCN: :[[@LINE-1]]:20: error: expected left paren after lit @@ -1834,16 +1834,16 @@ v_sqrt_f32 v2, lit(v1) // Make sure lit() is accepted on operands without modifiers. v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) -// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] // GFX89: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x30,0xe8,0x07,0x00,0x00] -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8) -// NOSICI: :[[@LINE-1]]:24: error: not a valid operand. -// NOGFX89: :[[@LINE-2]]:24: error: not a valid operand. -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:24: error: not a valid operand. +// NOSICI: :[[@LINE-5]]:24: error: not a valid operand. // NOSICIVI: :[[@LINE-1]]:24: error: not a valid operand. diff --git a/llvm/test/MC/RISCV/xsfvfexp.s b/llvm/test/MC/RISCV/xsfvfexp.s new file mode 100644 index 0000000..bd6aecd --- /dev/null +++ b/llvm/test/MC/RISCV/xsfvfexp.s @@ -0,0 +1,29 @@ +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexp32e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp32e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexp32e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp32e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexp16e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp16e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexp16e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp16e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfbfexp16e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +sf.vfexp.v v2, v5, v0.t +# CHECK-INST: sf.vfexp.v v2, v5, v0.t +# CHECK-ENCODING: [0x57,0x91,0x53,0x4c] +# CHECK-ERROR: instruction requires the following: 'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction){{$}} +# CHECK-UNKNOWN: 4c539157 <unknown> diff --git a/llvm/test/MC/RISCV/xsfvfexpa.s b/llvm/test/MC/RISCV/xsfvfexpa.s new file mode 100644 index 0000000..317a103 --- /dev/null +++ b/llvm/test/MC/RISCV/xsfvfexpa.s @@ -0,0 +1,15 @@ +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexpa %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexpa %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexpa - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexpa %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +sf.vfexpa.v v2, v5, v0.t +# CHECK-INST: sf.vfexpa.v v2, v5, v0.t +# CHECK-ENCODING: [0x57,0x11,0x53,0x4c] +# CHECK-ERROR: instruction requires the following: 'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction){{$}} +# CHECK-UNKNOWN: 4c531157 <unknown> diff --git a/llvm/test/ThinLTO/AArch64/aarch64_inline.ll b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll new file mode 100644 index 0000000..401f66d --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll @@ -0,0 +1,86 @@ +;; Test verifies inlining happens cross module when module flags are upgraded. +;; `foo` and `main` are both old semantic while bar is the new semantic. +;; Regression test for #82763 + +; RUN: split-file %s %t +; RUN: opt -module-summary %t/foo.ll -o %t/foo.o +; RUN: opt -module-summary %t/bar.ll -o %t/bar.o +; RUN: opt -module-summary %t/main.ll -o %t/main.o +; RUN: llvm-lto2 run %t/main.o %t/foo.o %t/bar.o -save-temps \ +; RUN: -o %t/t.exe \ +; RUN: -r=%t/foo.o,foo,plx \ +; RUN: -r=%t/bar.o,bar,plx \ +; RUN: -r=%t/main.o,foo,l \ +; RUN: -r=%t/main.o,bar,l \ +; RUN: -r=%t/main.o,main,plx 2>&1 +; RUN: llvm-dis %t/t.exe.1.4.opt.bc -o - | FileCheck %s + +; CHECK: define dso_local noundef i32 @main() local_unnamed_addr #0 { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 35 +; CHECK-NEXT: } + +; CHECK: attributes #0 = { {{.*}}"branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" } + +; CHECK: !llvm.module.flags = !{!0, !1, !2, !3} + +; CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2} +; CHECK: !1 = !{i32 8, !"sign-return-address", i32 2} +; CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2} +; CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} + + +;--- foo.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local noundef i32 @foo() local_unnamed_addr #0 { +entry: + ret i32 34 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 1} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} + +;--- bar.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local noundef i32 @bar() local_unnamed_addr #0 { +entry: + ret i32 1 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" } +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 2} +!1 = !{i32 8, !"sign-return-address", i32 2} +!2 = !{i32 8, !"sign-return-address-all", i32 2} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} + +;--- main.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo(); +declare i32 @bar(); + +define i32 @main() #0 { +entry: + %1 = call i32 @foo() + %2 = call i32 @bar() + %3 = add i32 %1, %2 + ret i32 %3 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 1} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll new file mode 100644 index 0000000..2eaa275 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll @@ -0,0 +1,332 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=dse -S %s | FileCheck %s + +define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_strided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @live_strided_store(ptr %ptr) { +; CHECK-LABEL: define void @live_strided_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @dead_dynamically_strided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) { +; CHECK-LABEL: define void @live_dynamically_strided_store( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_unstrided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @live_unstrided_store(ptr %ptr) { +; CHECK-LABEL: define void @live_unstrided_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_non_matrix_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6 +; CHECK-NEXT: store double 4.200000e+01, ptr [[DST_OFFSET]], align 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %dst.offset = getelementptr inbounds double, ptr %src, i32 6 + store double 42.0, ptr %dst.offset + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @live_non_matrix_store(ptr %ptr) { +; CHECK-LABEL: define void @live_non_matrix_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6 +; CHECK-NEXT: store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6 + store double 42.0, ptr %ptr.offset + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + store <8 x double> zeroinitializer, ptr %dst + ret void +} + +define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + store <16 x double> zeroinitializer, ptr %dst + ret void +} + +define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + store <4 x double> zeroinitializer, ptr %dst + ret void +} + +define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + store <8 x double> zeroinitializer, ptr %dst + ret void +} + +define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + ret void +} + +define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll new file mode 100644 index 0000000..03bd45b --- /dev/null +++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=gvn -S %s | FileCheck %s + +define void @redundant_unstrided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 8 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_unstrided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 1 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_strided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void + +} + +define void @redundant_strided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @repeat_load_dimension_change_project(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_project( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +define void @repeat_load_dimension_change_shuffle(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) +declare void @use(<8 x double>) diff --git a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll index b3f2e81..15ce3e3 100644 --- a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll +++ b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll @@ -5,7 +5,7 @@ ; attributes that should be transferred only if it is on all of the regions. ; This includes the attributes, no-nans-fp-math, -; no-signed-zeros-fp-math, less-precise-fpmad, unsafe-fp-math, and +; no-signed-zeros-fp-math, less-precise-fpmad, and ; no-infs-fp-math. Only when each instance of similarity has these attributes ; can we say that the outlined function can have these attributes since that ; is the more general case for these attributes. @@ -101,7 +101,7 @@ entry: } attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "less-precise-fpmad"="true" -"unsafe-fp-math"="true" "no-infs-fp-math"="true"} +"no-infs-fp-math"="true"} ; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) [[ATTR1:#[0-9]+]] { ; CHECK: entry_to_outline: @@ -122,5 +122,5 @@ attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "les ; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 -; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "unsafe-fp-math"="false" } -; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" } +; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" } +; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/Transforms/Inline/attributes.ll b/llvm/test/Transforms/Inline/attributes.ll index 55ab430..da7eeda 100644 --- a/llvm/test/Transforms/Inline/attributes.ll +++ b/llvm/test/Transforms/Inline/attributes.ll @@ -601,46 +601,6 @@ define i32 @test_no-signed-zeros-fp-math3(i32 %i) "no-signed-zeros-fp-math"="tru ; CHECK-NEXT: ret i32 } -define i32 @unsafe-fp-math_callee0(i32 %i) "unsafe-fp-math"="false" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee0(i32 %i) [[UNSAFE_FPMATH_FALSE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @unsafe-fp-math_callee1(i32 %i) "unsafe-fp-math"="true" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee1(i32 %i) [[UNSAFE_FPMATH_TRUE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math0(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math0(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math1(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math1(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math2(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math2(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math3(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32 @unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math3(i32 %i) [[UNSAFE_FPMATH_TRUE]] { -; CHECK-NEXT: ret i32 -} - ; Test that fn_ret_thunk_extern has no CompatRule; inlining is permitted. ; Test that fn_ret_thunk_extern has no MergeRule; fn_ret_thunk_extern is not ; propagated or dropped on the caller after inlining. @@ -693,6 +653,4 @@ define i32 @loader_replaceable_caller() { ; CHECK: attributes [[NO_NANS_FPMATH_TRUE]] = { "no-nans-fp-math"="true" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_FALSE]] = { "no-signed-zeros-fp-math"="false" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_TRUE]] = { "no-signed-zeros-fp-math"="true" } -; CHECK: attributes [[UNSAFE_FPMATH_FALSE]] = { "unsafe-fp-math"="false" } -; CHECK: attributes [[UNSAFE_FPMATH_TRUE]] = { "unsafe-fp-math"="true" } ; CHECK: attributes [[FNRETTHUNK_EXTERN]] = { fn_ret_thunk_extern } diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll deleted file mode 100644 index 7816781..0000000 --- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll +++ /dev/null @@ -1,243 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes=instcombine | FileCheck %s -@A = extern_weak global float, align 4 - -; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true -; and false values, while %v1 and %phi.to.remove are actually the same. -; Fold the selection instruction %same.as.v1 to %v1. -define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; %phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. -define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; same.as.v1 are swapped. -; Check that %same.as.v1 can be folded. -define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_3( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. -define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 11cc971..fb7890a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll index ae8dc2d..005ca8c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll @@ -175,18 +175,28 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -227,18 +237,28 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index b23702d..2a19402 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0 -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8 ; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8 +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8 +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8 -; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8 +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -435,7 +435,7 @@ exit: ret void } -; We should interleave by 2 after narrowing interleave groups to saturate +; FIXME: We should interleave by 2 after narrowing interleave groups to saturate ; load/store units. define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK-LABEL: define void @test_interleave_after_narrowing( @@ -447,18 +447,12 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]] ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index 2865495..c261760 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -13,8 +13,12 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2: [[VECTOR_PH]]: ; VF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF2: [[VECTOR_BODY]]: -; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8 -; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8 +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8 ; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index 829acbbf..b63e03d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -1,81 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 -; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s -; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s target triple = "aarch64-unknown-linux" define void @load_store_interleave_group(ptr noalias %data) { -; IC1-LABEL: define void @load_store_interleave_group( -; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { -; IC1-NEXT: [[ENTRY:.*:]] -; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] -; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; IC1: [[VECTOR_PH]]: -; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] -; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] -; IC1-NEXT: br label %[[VECTOR_BODY:.*]] -; IC1: [[VECTOR_BODY]]: -; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 -; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] -; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8 -; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8 -; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; IC1: [[MIDDLE_BLOCK]]: -; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] -; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] -; IC1: [[SCALAR_PH]]: -; ; CHECK-LABEL: define void @load_store_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8 ; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8 -; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -105,82 +53,27 @@ exit: } define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) { -; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group( -; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { -; IC1-NEXT: [[ENTRY:.*:]] -; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]] -; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; IC1: [[VECTOR_PH]]: -; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] -; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] -; IC1-NEXT: br label %[[VECTOR_BODY:.*]] -; IC1: [[VECTOR_BODY]]: -; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 -; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP4]] -; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8 -; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]] -; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8 -; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; IC1: [[MIDDLE_BLOCK]]: -; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]] -; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] -; IC1: [[SCALAR_PH]]: -; ; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]] -; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]] ; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8 -; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -210,3 +103,175 @@ loop: exit: ret void } + +define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) { +; IC1-LABEL: define void @test_masked_interleave_group( +; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IC1: [[VECTOR_MEMCHECK]]: +; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; +; CHECK-LABEL: define void @test_masked_interleave_group( +; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ] + %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ] + %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ] + %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1 + %mask.val = load i8, ptr %mask.iv, align 1 + %should.copy = icmp eq i8 %mask.val, 0 + br i1 %should.copy, label %then, label %loop.latch + +then: + %elem0 = load float, ptr %src.iv, align 4 + store float %elem0, ptr %dst.iv, align 4 + %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4 + %s1 = load float, ptr %src.1.ptr, align 4 + %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4 + store float %s1, ptr %dst.1.ptr, align 4 + %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8 + %s2 = load float, ptr %src.2.ptr, align 4 + %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8 + store float %s2, ptr %dst.2.ptr, align 4 + %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12 + %s3 = load float, ptr %src.3.ptr, align 4 + %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12 + store float %s3, ptr %dst.3.ptr, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %src.iv.next = getelementptr i8, ptr %src.iv, i64 16 + %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16 + %ec = icmp eq i32 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index abfb44d..d290f2d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -60,26 +60,32 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]] -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]] ; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]] -; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8 -; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll index f2e689c..75980ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll @@ -328,8 +328,10 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) { ; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] -; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index c8d20dc..b26e9cf 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi" %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.TwoBytes = type { i8, i8 } @@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi" %struct.FourBytes = type { i8, i8, i8, i8 } %struct.FiveBytes = type { i8, i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10 @@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. +define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr %w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll index f4d80af..2a3ce03 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -8,69 +8,15 @@ target triple = "x86_64-unknown-linux" define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { ; CHECK-LABEL: define void @test_4xi64( ; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16 -; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]] -; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8 -; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] -; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0 @@ -81,15 +27,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] ; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0 @@ -110,7 +56,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -171,7 +117,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -194,7 +140,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -249,7 +195,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -272,7 +218,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -327,7 +273,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -350,7 +296,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -405,7 +351,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -428,7 +374,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -489,7 +435,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -513,7 +459,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -573,7 +519,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -598,7 +544,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -707,7 +653,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -731,7 +677,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -765,20 +711,19 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll new file mode 100644 index 0000000..01934b1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -0,0 +1,30 @@ +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified interleave count is ignored. + +; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-LABEL: @loop_distance_4 +define void @loop_distance_4(ptr %a, ptr %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ] + %0 = getelementptr i32, ptr %b, i64 %iv + %arrayidx = getelementptr i8, ptr %0, i64 -16 + %1 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %2 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %2, %1 + store i32 %add, ptr %0, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !3} +!2 = !{!"llvm.loop.interleave.count", i32 4} +!3 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index a3cf23b..f793814 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -1547,3 +1547,28 @@ bb2: call void @use(i1 %c4) ret void } + +define i1 @and_predicate_dominating_phi(i32 %x) { +; CHECK-LABEL: @and_predicate_dominating_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: ret i1 true +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + %ret = icmp uge i32 %res, 1 + ret i1 %ret +} diff --git a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll index 0f18dc2..46e38d9 100644 --- a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S >%t +; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S >%t ; RUN: FileCheck %s < %t ; ModuleID = 't.cc' diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll index c7bc43e1..b61d659 100644 --- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S | \ +; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S | \ ; RUN: FileCheck %s ; This case is copied from test/Transforms/SimplifyCFG/AArch64/ diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index 2e96a92..cc1dc4e 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) { call void @foo(i1 %a15) ret void } + +define i32 @test_and_with_phinode(i32 %x) { +; CHECK-LABEL: @test_and_with_phinode( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + ret i32 %res +} diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll new file mode 100644 index 0000000..10566ae --- /dev/null +++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll @@ -0,0 +1,132 @@ +; -stats requires asserts +; REQUIRES: asserts + +; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled. +; Check that we skip devirtualization for empty functions in speculative devirtualization mode + +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \ +; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:13:0: devirtualized vf +; CHECK-NOT: devirtualized + +@vt1 = constant [1 x ptr] [ptr @vf], !type !8 +@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12 + +define i1 @vf(ptr %this) #0 !dbg !7 { + ret i1 true +} + +; This should NOT be devirtualized because during non-lto empty functions +; are skipped. +define void @vf_empty(ptr %this) !dbg !11 { + ret void +} + +; CHECK: define void @call +define void @call(ptr %obj) #1 !dbg !5 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !6 + ret void +} + + +; CHECK: define void @call1 +define void @call1(ptr %obj) #1 !dbg !9 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable, align 8 + ; CHECK: call i1 %fptr + %1 = call i1 %fptr(ptr %obj), !dbg !10 + ret void +} +declare ptr @llvm.load.relative.i32(ptr, i32) + +@vt3 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32) +], align 4, !type !15 + +; CHECK: define void @call2 +define void @call2(ptr %obj) #1 !dbg !13 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0) + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !14 + ret void +} + +@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [ + i32 0, ; offset to top + i32 0, ; rtti + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32) ; vf_emptyunc offset +] }, align 4, !type !18 + +; CHECK: define void @call3 +define void @call3(ptr %obj) #1 !dbg !16 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !17 + ret void +} + + +declare i1 @llvm.type.test(ptr, metadata) +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "devirt-single.cc", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{!"clang version 4.0.0 (trunk 278098)"} +!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!6 = !DILocation(line: 30, column: 32, scope: !5) +!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!8 = !{i32 0, !"typeid"} + +!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!10 = !DILocation(line: 35, column: 32, scope: !9) +!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!12 = !{i32 0, !"typeid1"} + +!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!14 = !DILocation(line: 41, column: 32, scope: !13) +!15 = !{i32 0, !"typeid2"} + +!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!17 = !DILocation(line: 51, column: 32, scope: !16) +!18 = !{i32 0, !"typeid3"} + + + +; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets +; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll index d8f5c91..8327e1c 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll @@ -11,6 +11,9 @@ ; Check wildcard ; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP +; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled. +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD + target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" @@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32) ; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations ; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations ; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations + +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of unique return value optimizations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of virtual constant propagations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of 1 bit virtual constant propagations diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll index b2362f8..ade228d 100644 --- a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll +++ b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll @@ -49,7 +49,7 @@ entry: ; CHECK-FUNC-LEVEL-ABC: Function: abc ; CHECK-FUNC-LEVEL-NEXT-ABC: [ 3630.00 3672.00 3714.00 ] -; CHECK-FUNC-DEF: Error: Function 'def' not found +; CHECK-FUNC-DEF: error: Function 'def' not found ; CHECK-BB-LEVEL: Function: abc ; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00 3672.00 3714.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll index f9aa108..9d60e12 100644 --- a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll +++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll @@ -49,7 +49,7 @@ entry: ; CHECK-FUNC-LEVEL-ABC: Function: abc ; CHECK-FUNC-LEVEL-NEXT-ABC: [ 878.00 889.00 900.00 ] -; CHECK-FUNC-DEF: Error: Function 'def' not found +; CHECK-FUNC-DEF: error: Function 'def' not found ; CHECK-BB-LEVEL: Function: abc ; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir new file mode 100644 index 0000000..ef835fe --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.mir @@ -0,0 +1,92 @@ +# REQUIRES: x86_64-linux +# RUN: llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT +# RUN: llvm-ir2vec embeddings --mode=mir --level=func --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL +# RUN: llvm-ir2vec embeddings --mode=mir --level=func --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ADD +# RUN: not llvm-ir2vec embeddings --mode=mir --level=func --function=missing_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-MISSING +# RUN: llvm-ir2vec embeddings --mode=mir --level=bb --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL +# RUN: llvm-ir2vec embeddings --mode=mir --level=inst --function=add_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + define dso_local noundef i32 @add_function(i32 noundef %a, i32 noundef %b) { + entry: + %sum = add nsw i32 %a, %b + %result = mul nsw i32 %sum, 2 + ret i32 %result + } + + define dso_local void @simple_function() { + entry: + ret void + } +... +--- +name: add_function +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr32 } + - { id: 1, class: gr32 } + - { id: 2, class: gr32 } + - { id: 3, class: gr32 } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: $edi, $esi + + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %2:gr32 = nsw ADD32rr %0, %1, implicit-def dead $eflags + %3:gr32 = ADD32rr %2, %2, implicit-def dead $eflags + $eax = COPY %3 + RET 0, $eax + +--- +name: simple_function +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry: + RET 0 + +# CHECK-DEFAULT: MIR2Vec embeddings for machine function add_function: +# CHECK-DEFAULT-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-DEFAULT: MIR2Vec embeddings for machine function simple_function: +# CHECK-DEFAULT-NEXT: Function vector: [ 1.10 1.20 1.30 ] + +# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-FUNC-LEVEL: MIR2Vec embeddings for machine function simple_function: +# CHECK-FUNC-LEVEL-NEXT: Function vector: [ 1.10 1.20 1.30 ] + +# CHECK-FUNC-LEVEL-ADD: MIR2Vec embeddings for machine function add_function: +# CHECK-FUNC-LEVEL-ADD-NEXT: Function vector: [ 26.50 27.10 27.70 ] +# CHECK-FUNC-LEVEL-ADD-NOT: simple_function + +# CHECK-FUNC-MISSING: error: Function 'missing_function' not found + +# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-BB-LEVEL-NEXT: Basic block vectors: +# CHECK-BB-LEVEL-NEXT: MBB entry: [ 26.50 27.10 27.70 ] +# CHECK-BB-LEVEL: MIR2Vec embeddings for machine function simple_function: +# CHECK-BB-LEVEL-NEXT: Basic block vectors: +# CHECK-BB-LEVEL-NEXT: MBB entry: [ 1.10 1.20 1.30 ] + +# CHECK-INST-LEVEL: MIR2Vec embeddings for machine function add_function: +# CHECK-INST-LEVEL-NEXT: Instruction vectors: +# CHECK-INST-LEVEL: %1:gr32 = COPY $esi +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL-NEXT: %0:gr32 = COPY $edi +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL: %2:gr32 = nsw ADD32rr +# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ] +# CHECK-INST-LEVEL: %3:gr32 = ADD32rr +# CHECK-INST-LEVEL: -> [ 3.70 3.80 3.90 ] +# CHECK-INST-LEVEL: $eax = COPY %3:gr32 +# CHECK-INST-LEVEL-NEXT: -> [ 6.00 6.10 6.20 ] +# CHECK-INST-LEVEL: RET 0, $eax +# CHECK-INST-LEVEL-NEXT: -> [ 1.10 1.20 1.30 ] diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.ll b/llvm/test/tools/llvm-ir2vec/error-handling.ll index b944ea0..8e9e455 100644 --- a/llvm/test/tools/llvm-ir2vec/error-handling.ll +++ b/llvm/test/tools/llvm-ir2vec/error-handling.ll @@ -10,4 +10,4 @@ entry: } ; CHECK-NO-VOCAB: error: IR2Vec vocabulary file path not specified; You may need to set it using --ir2vec-vocab-path -; CHECK-FUNC-NOT-FOUND: Error: Function 'nonexistent' not found +; CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent' not found diff --git a/llvm/test/tools/llvm-ir2vec/error-handling.mir b/llvm/test/tools/llvm-ir2vec/error-handling.mir new file mode 100644 index 0000000..caec454c --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/error-handling.mir @@ -0,0 +1,41 @@ +# REQUIRES: x86_64-linux +# Test error handling and input validation for llvm-ir2vec tool in MIR mode + +# RUN: not llvm-ir2vec embeddings --mode=mir %s 2>&1 | FileCheck %s -check-prefix=CHECK-NO-VOCAB +# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/nonexistent-vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-VOCAB-NOT-FOUND +# RUN: not llvm-ir2vec embeddings --mode=mir --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_invalid_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-VOCAB +# RUN: not llvm-ir2vec embeddings --mode=mir --function=nonexistent_function --mir2vec-vocab-path=%S/../../CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-NOT-FOUND + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + define dso_local noundef i32 @test_function(i32 noundef %a) { + entry: + ret i32 %a + } +... +--- +name: test_function +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr32 } +liveins: + - { reg: '$edi', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $edi + + %0:gr32 = COPY $edi + $eax = COPY %0 + RET 0, $eax + +# CHECK-NO-VOCAB: error: Failed to load MIR2Vec vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path + +# CHECK-VOCAB-NOT-FOUND: error: Failed to load MIR2Vec vocabulary +# CHECK-VOCAB-NOT-FOUND: No such file or directory + +# CHECK-INVALID-VOCAB: error: Failed to load MIR2Vec vocabulary - Missing 'Opcodes' section in vocabulary file + +# CHECK-FUNC-NOT-FOUND: error: Function 'nonexistent_function' not found diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s index 1ffe533..d1df304 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s @@ -1403,8 +1403,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2 @@ -1415,8 +1415,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2 @@ -1427,8 +1427,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2 @@ -1749,7 +1749,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 +# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2126,8 +2126,8 @@ vzeroupper # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s index 6dc5bac..6c8fac4 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s @@ -560,14 +560,14 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 3 1.00 vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 8 1.00 * vperm2i128 $1, (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 5 1.00 vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 3 13 2.00 * vpermpd $1, (%rax), %ymm2 -# CHECK-NEXT: 2 7 1.00 vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 3 14 2.00 * vpermps (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermq $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 @@ -789,7 +789,7 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 132.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 +# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 128.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -894,13 +894,13 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vperm2i128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s index 72d7de3..14b8e5f 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s @@ -1207,7 +1207,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 0.50 valignd $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} @@ -1216,7 +1216,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 0.50 valignq $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax){1to8}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignq $1, %zmm16, %zmm17, %zmm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s index 552b3e4..ead609e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s @@ -1948,7 +1948,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 3 0.50 vaddps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignd $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1957,7 +1957,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -1966,7 +1966,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignq $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1975,7 +1975,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3614,7 +3614,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1083.00 636.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 +# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1084.00 637.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -3663,7 +3663,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3681,7 +3681,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s index 87ba060..d1f2a98 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s index 3c80c56..ea7a280 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s @@ -16,10 +16,10 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -48,11 +48,11 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 8.00 8.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - +# CHECK-NEXT: - - - - - - - - 6.00 6.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s index f4888cf..afbd566 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s @@ -69,12 +69,12 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 2 5 0.33 * blsrl (%rax), %ecx # CHECK-NEXT: 1 1 0.25 blsrq %rax, %rcx # CHECK-NEXT: 2 5 0.33 * blsrq (%rax), %rcx -# CHECK-NEXT: 2 2 1.00 tzcntw %ax, %cx -# CHECK-NEXT: 2 6 0.50 * tzcntw (%rax), %cx -# CHECK-NEXT: 2 2 0.50 tzcntl %eax, %ecx -# CHECK-NEXT: 2 6 0.50 * tzcntl (%rax), %ecx -# CHECK-NEXT: 2 2 0.50 tzcntq %rax, %rcx -# CHECK-NEXT: 2 6 0.50 * tzcntq (%rax), %rcx +# CHECK-NEXT: 1 1 0.25 tzcntw %ax, %cx +# CHECK-NEXT: 1 5 0.50 * tzcntw (%rax), %cx +# CHECK-NEXT: 1 1 0.50 tzcntl %eax, %ecx +# CHECK-NEXT: 1 5 0.50 * tzcntl (%rax), %ecx +# CHECK-NEXT: 1 1 0.50 tzcntq %rax, %rcx +# CHECK-NEXT: 1 5 0.50 * tzcntq (%rax), %rcx # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -103,7 +103,7 @@ tzcnt (%rax), %rcx # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 4.33 4.33 4.33 5.00 9.50 9.50 5.00 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - +# CHECK-NEXT: 4.33 4.33 4.33 4.25 8.75 8.75 4.25 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -127,7 +127,7 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrl (%rax), %ecx # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - blsrq %rax, %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrq (%rax), %rcx -# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - - - - - - - - - - tzcntw %ax, %cx +# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - tzcntw %ax, %cx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntw (%rax), %cx # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - - - - - - - - - - - tzcntl %eax, %ecx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntl (%rax), %ecx diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s index 64feeaf..26a42fd 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s @@ -15,10 +15,10 @@ lock cmpxchg16b (%rax) # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 19 3 6.00 * * cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * cmpxchg16b (%rax) -# CHECK-NEXT: 19 3 6.00 * * lock cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * lock cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * lock cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * lock cmpxchg16b (%rax) # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -47,11 +47,11 @@ lock cmpxchg16b (%rax) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - 41.50 41.50 41.50 41.50 - - - - - - - - - - - - - - - - +# CHECK-NEXT: - - - 30.00 30.00 30.00 30.00 - - - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) -# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s index a36fb2aa..fc2bc8e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s @@ -13,8 +13,8 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: 4 11 2.00 * pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: 4 4 1.50 pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: 4 11 1.50 * pclmulqdq $11, (%rax), %xmm2 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s index 015d37e..ae60835 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s @@ -52,12 +52,12 @@ pcmpgtq (%rax), %xmm2 # CHECK-NEXT: 1 7 1.00 * crc32q (%rax), %rcx # CHECK-NEXT: 8 6 3.00 pcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * pcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 pcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * pcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 pcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * pcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 4 2 2.00 pcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * pcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 pcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * pcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 pcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * pcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 pcmpgtq %xmm0, %xmm2 # CHECK-NEXT: 1 8 0.50 * pcmpgtq (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s index 55a36d0..dca4703 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s index 9c5b4e4..886d9c6 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s @@ -1173,18 +1173,18 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 6 0.67 * * andq %rsi, (%rax) # CHECK-NEXT: 1 6 0.67 * * lock andq %rsi, (%rax) # CHECK-NEXT: 1 5 0.33 * andq (%rax), %rdi -# CHECK-NEXT: 6 1 1.00 bsfw %si, %di -# CHECK-NEXT: 6 1 1.00 bsrw %si, %di -# CHECK-NEXT: 7 5 1.00 * bsfw (%rax), %di -# CHECK-NEXT: 7 5 1.00 * bsrw (%rax), %di -# CHECK-NEXT: 6 1 1.00 bsfl %esi, %edi -# CHECK-NEXT: 6 1 1.00 bsrl %esi, %edi -# CHECK-NEXT: 7 5 1.00 * bsfl (%rax), %edi -# CHECK-NEXT: 7 5 1.00 * bsrl (%rax), %edi -# CHECK-NEXT: 6 1 1.00 bsfq %rsi, %rdi -# CHECK-NEXT: 6 1 1.00 bsrq %rsi, %rdi -# CHECK-NEXT: 7 5 1.00 * bsfq (%rax), %rdi -# CHECK-NEXT: 7 5 1.00 * bsrq (%rax), %rdi +# CHECK-NEXT: 1 1 1.00 bsfw %si, %di +# CHECK-NEXT: 1 1 1.00 bsrw %si, %di +# CHECK-NEXT: 2 5 1.00 * bsfw (%rax), %di +# CHECK-NEXT: 2 5 1.00 * bsrw (%rax), %di +# CHECK-NEXT: 1 1 1.00 bsfl %esi, %edi +# CHECK-NEXT: 1 1 1.00 bsrl %esi, %edi +# CHECK-NEXT: 2 5 1.00 * bsfl (%rax), %edi +# CHECK-NEXT: 2 5 1.00 * bsrl (%rax), %edi +# CHECK-NEXT: 1 1 1.00 bsfq %rsi, %rdi +# CHECK-NEXT: 1 1 1.00 bsrq %rsi, %rdi +# CHECK-NEXT: 2 5 1.00 * bsfq (%rax), %rdi +# CHECK-NEXT: 2 5 1.00 * bsrq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 bswapl %eax # CHECK-NEXT: 1 1 0.25 bswapq %rax # CHECK-NEXT: 1 1 0.50 btw %si, %di @@ -1321,23 +1321,23 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 decq %rdi # CHECK-NEXT: 1 6 0.67 * * decq (%rax) # CHECK-NEXT: 1 6 0.67 * * lock decq (%rax) -# CHECK-NEXT: 2 10 10.00 U divb %dil -# CHECK-NEXT: 2 14 10.00 * U divb (%rax) -# CHECK-NEXT: 2 11 11.00 U divw %si -# CHECK-NEXT: 2 15 11.00 * U divw (%rax) -# CHECK-NEXT: 2 13 13.00 U divl %edx -# CHECK-NEXT: 2 17 13.00 * U divl (%rax) -# CHECK-NEXT: 2 17 17.00 U divq %rcx -# CHECK-NEXT: 2 21 17.00 * U divq (%rax) +# CHECK-NEXT: 2 9 9.00 U divb %dil +# CHECK-NEXT: 2 13 9.00 * U divb (%rax) +# CHECK-NEXT: 2 10 10.00 U divw %si +# CHECK-NEXT: 2 14 10.00 * U divw (%rax) +# CHECK-NEXT: 2 12 12.00 U divl %edx +# CHECK-NEXT: 2 16 12.00 * U divl (%rax) +# CHECK-NEXT: 2 18 18.00 U divq %rcx +# CHECK-NEXT: 2 22 18.00 * U divq (%rax) # CHECK-NEXT: 100 100 25.00 U enter $7, $4095 -# CHECK-NEXT: 2 10 10.00 U idivb %dil -# CHECK-NEXT: 2 14 10.00 * U idivb (%rax) -# CHECK-NEXT: 2 11 11.00 U idivw %si -# CHECK-NEXT: 2 15 11.00 * U idivw (%rax) -# CHECK-NEXT: 2 13 13.00 U idivl %edx -# CHECK-NEXT: 2 17 13.00 * U idivl (%rax) -# CHECK-NEXT: 2 17 17.00 U idivq %rcx -# CHECK-NEXT: 2 21 17.00 * U idivq (%rax) +# CHECK-NEXT: 2 9 9.00 U idivb %dil +# CHECK-NEXT: 2 13 9.00 * U idivb (%rax) +# CHECK-NEXT: 2 10 10.00 U idivw %si +# CHECK-NEXT: 2 14 10.00 * U idivw (%rax) +# CHECK-NEXT: 2 12 12.00 U idivl %edx +# CHECK-NEXT: 2 16 12.00 * U idivl (%rax) +# CHECK-NEXT: 2 18 18.00 U idivq %rcx +# CHECK-NEXT: 2 22 18.00 * U idivq (%rax) # CHECK-NEXT: 1 3 3.00 imulb %dil # CHECK-NEXT: 1 7 3.00 * imulb (%rax) # CHECK-NEXT: 3 3 3.00 imulw %di @@ -1891,12 +1891,12 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 5 0.67 * * xaddq %rax, (%rbx) # CHECK-NEXT: 1 5 0.67 * * lock xaddq %rax, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgb %bl, %cl -# CHECK-NEXT: 5 7 0.50 * * xchgb %bl, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgb %bl, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgw %bx, %ax # CHECK-NEXT: 2 1 0.50 xchgw %bx, %cx -# CHECK-NEXT: 5 7 0.50 * * xchgw %ax, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgw %ax, (%rbx) # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %eax # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %ecx # CHECK-NEXT: 2 6 0.50 * * xchgl %eax, (%rbx) @@ -1975,7 +1975,7 @@ xorq (%rax), %rdi # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 259.00 259.00 259.00 1733.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 +# CHECK-NEXT: 259.00 259.00 259.00 1725.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2266,23 +2266,23 @@ xorq (%rax), %rdi # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - decq %rdi # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 decq (%rax) # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 lock decq (%rax) -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - divw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - divl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - divq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - divb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - divl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - divq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) # CHECK-NEXT: - - - 25.00 25.00 25.00 25.00 - - - - - - - - - - - - - - - - enter $7, $4095 -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - idivw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - idivl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - idivq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - idivb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - idivl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - idivq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulb %dil # CHECK-NEXT: 0.33 0.33 0.33 - 3.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - imulb (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulw %di diff --git a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test index aaaf6bf..9899dc5 100644 --- a/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test +++ b/llvm/test/tools/llvm-objdump/MachO/disassemble-source-dsym.test @@ -13,4 +13,35 @@ # RUN: dsymutil -f -oso-prepend-path=%p/../../dsymutil/ %t3 -o %t3.dSYM # RUN: llvm-objdump --source --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s +## Test that --source works with --macho flag. + +## --macho w/ explicit .dSYM +# RUN: llvm-objdump < %p/../../dsymutil/Inputs/basic.macho.x86_64 - --source --macho --dsym=%t1.dSYM --prefix=%p/../../dsymutil | \ +# RUN: FileCheck --check-prefix=SOURCE %s + +## --macho w/ auto-detected .dSYM (dir) +# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t2 | FileCheck --check-prefix=SOURCE %s + +## --macho w/ auto-detected .dSYM (file) +# RUN: llvm-objdump --source --macho --prefix=%p/../../dsymutil %t3 | FileCheck --check-prefix=SOURCE %s + # SOURCE: ; int bar(int arg) { + +## Test that --line-numbers works with --macho flag. + +## --macho -l w/ explicit .dSYM +# RUN: llvm-objdump -d -l --macho --dsym=%t1.dSYM %p/../../dsymutil/Inputs/basic.macho.x86_64 | FileCheck --check-prefix=LINE %s + +## --macho -l w/ object file (embedded debug info) +# RUN: llvm-objdump -d -l --macho %p/../../dsymutil/Inputs/basic1.macho.x86_64.o | FileCheck --check-prefix=LINE_OBJ %s + +# LINE: (__TEXT,__text) section +# LINE: _bar: +# LINE: ; bar(): +# LINE: ; {{.*}}basic3.c: + +# LINE_OBJ: (__TEXT,__text) section +# LINE_OBJ: _main: +# LINE_OBJ: ; main(): +# LINE_OBJ: ; {{.*}}basic1.c:23 +# LINE_OBJ: pushq %rbp ## basic1.c:23:0 diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test index 904892a..12a9d05 100644 --- a/llvm/test/tools/llvm-readobj/ELF/section-types.test +++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test @@ -63,6 +63,8 @@ # LLVM: Type: SHT_LLVM_PART_PHDR # LLVM: Name: .llvm.lto # LLVM: Type: SHT_LLVM_LTO +# LLVM: Name: .llvm.callgraph +# LLVM: Type: SHT_LLVM_CALL_GRAPH # LLVM: Name: gnu_sframe # LLVM: Type: SHT_GNU_SFRAME # LLVM: Name: gnu_attributes @@ -127,6 +129,7 @@ # GNU-NEXT: part1 LLVM_PART_EHDR # GNU-NEXT: .phdrs LLVM_PART_PHDR # GNU-NEXT: .llvm.lto LLVM_LTO +# GNU-NEXT: .llvm.callgraph LLVM_CALL_GRAPH # GNU-NEXT: gnu_sframe SFRAME # GNU-NEXT: gnu_attributes ATTRIBUTES # GNU-NEXT: gnu_hash GNU_HASH @@ -218,6 +221,8 @@ Sections: Type: SHT_LLVM_PART_PHDR - Name: .llvm.lto Type: SHT_LLVM_LTO + - Name: .llvm.callgraph + Type: SHT_LLVM_CALL_GRAPH - Name: gnu_sframe Type: SHT_GNU_SFRAME - Name: gnu_attributes |